Blame - Lib/email/Header.py - platform/external/python/cpython3

2002-04-10 21:01:31 +0000

[diff] [blame]

1

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

2

# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

3

4

"""Header encoding and decoding functionality."""

5

6

import re

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

7

import binascii

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

8

from types import StringType, UnicodeType

9

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

10

import email.quopriMIME

11

import email.base64MIME

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

12

from email.Errors import HeaderParseError

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

13

from email.Charset import Charset

14

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

15

try:

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

16

from email._compat22 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

17

except SyntaxError:

18

# Python 2.1 spells integer division differently

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

19

from email._compat21 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

20

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

try:

True, False

except NameError:

True = 1

False = 0

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

27

CRLFSPACE = '\r\n '

28

CRLF = '\r\n'

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

29

NL = '\n'

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

30

SPACE = ' '

Barry Warsaw

4848805

2003-03-06 16:10:30 +0000

[diff] [blame]

31

USPACE = u' '

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

32

SPACE8 = ' ' * 8

33

EMPTYSTRING = ''

Barry Warsaw

4848805

2003-03-06 16:10:30 +0000

[diff] [blame]

34

UEMPTYSTRING = u''

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

MAXLINELEN = 76

ENCODE = 1

DECODE = 2

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

41

USASCII = Charset('us-ascii')

42

UTF8 = Charset('utf-8')

43

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

44

# Match encoded-word strings in the form =?charset?q?Hello_World?=

45

ecre = re.compile(r'''

46

=\? # literal =?

47

(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset

48

\? # literal ?

49

(?P<encoding>[qb]) # either a "q" or a "b", case insensitive

50

\? # literal ?

51

(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string

52

\?= # literal ?=

53

''', re.VERBOSE | re.IGNORECASE)

54

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

55

pcre = re.compile('([,;])')

56

57

# Field name regexp, including trailing colon, but not separating whitespace,

58

# according to RFC 2822.  Character range is from exclamation mark to tilde.

59

# For use with .match()

60

fcre = re.compile(r'[\041-\176]+:$')

61

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

# Helpers

_max_append = email.quopriMIME._max_append

def decode_header(header):
    """Split a header value into its RFC 2047 pieces without charset conversion.

    The result is a list of (decoded_string, charset) pairs, one per decoded
    part of the header.  For parts that were not encoded-words the charset is
    None; for encoded-words it is the lower-cased charset name taken from the
    encoded-word itself.

    An email.Errors.HeaderParseError is raised when a base64 payload cannot
    be decoded.
    """
    text = str(header)
    # Fast path: nothing in the header looks like an encoded-word.
    if not ecre.search(text):
        return [(text, None)]
    pairs = []
    for physical in text.splitlines():
        # An individual line may still be free of encoded-words.
        if not ecre.search(physical):
            pairs.append((physical, None))
            continue
        # ecre has three groups, so the split result is laid out as
        # [plain, charset, encoding, payload, plain, charset, ...].
        fields = ecre.split(physical)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                # Glue consecutive unencoded text together with a space.
                if pairs and pairs[-1][1] is None:
                    pairs[-1] = (pairs[-1][0] + SPACE + plain, None)
                else:
                    pairs.append((plain, None))
            group = fields[start + 1:start + 4]
            if not group:
                # Trailing plain text; no encoded-word follows it.
                continue
            charset = group[0].lower()
            encoding = group[1].lower()
            payload = group[2]
            if encoding == 'q':
                value = email.quopriMIME.header_decode(payload)
            elif encoding == 'b':
                try:
                    value = email.base64MIME.decode(payload)
                except binascii.Error:
                    # Turn this into a higher level exception.  BAW: Right
                    # now we throw the lower level exception away but
                    # when/if we get exception chaining, we'll preserve it.
                    raise HeaderParseError
            else:
                value = None
            if value is None:
                value = payload
            # Adjacent encoded-words in the same charset are merged.
            if pairs and pairs[-1][1] == charset:
                pairs[-1] = (pairs[-1][0] + value, pairs[-1][1])
            else:
                pairs.append((value, charset))
    return pairs

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame]

126

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header instance from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): each pair is (decoded_string, charset), where charset is
    the string name of the character set, a Charset instance, or None.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cset in decoded_seq:
        # None means us-ascii and is passed straight through to append();
        # anything else that is not already a Charset gets wrapped in one.
        if not (cset is None or isinstance(cset, Charset)):
            cset = Charset(cset)
        result.append(text, cset)
    return result

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

149

class Header:
    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # Width of the continuation prefix in columns (a tab counts as 8).
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function.

        Returns the chunks decoded to Unicode and concatenated, with a single
        space inserted wherever the header switches between plain
        (None/us-ascii) text and text in some other charset.
        """
        uchunks = []
        last_plain = True
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            #
            # append() always stores a Charset instance, so us-ascii has to
            # be recognized explicitly on *both* sides of the boundary;
            # testing the previous chunk only against None made a leading
            # us-ascii chunk look like an encoded one.
            plain = charset is None or charset == 'us-ascii'
            if uchunks and plain != last_plain:
                uchunks.append(USPACE)
            last_plain = plain
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, StringType) is true), then charset is the encoding
        of that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, StringType):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, UnicodeType):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                        break
                    except UnicodeError:
                        pass
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split up a header safely for use with encode_chunks.  Returns a
        # list of (string, charset) pairs; the first pair is limited to
        # maxlinelen, the rest to self._maxlinelen.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Fold s with the module-level _split_ascii() and tag every resulting
        # physical line with charset.
        line = _split_ascii(s, firstlen, self._maxlinelen,
                            self._continuation_ws, splitchars)
        # Build the list explicitly so callers can concatenate it with other
        # lists regardless of what zip() returns.
        return [(physical, charset) for physical in line.splitlines()]

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.Utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        #
        chunks = []
        for header, charset in newchunks:
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            _max_append(chunks, s, maxlinelen, ' ')
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(newchunks, maxlinelen)

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):

Barry Warsaw

2003-03-07 15:39:37 +0000

[diff] [blame^]

415

linejoiner = '\n' + continuation_ws

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

416

lines = []

417

maxlen = firstlen

418

for line in s.splitlines():

419

if len(line) < maxlen:

lines.append(line)

maxlen = restlen

continue

# Attempt to split the line at the highest-level syntactic break

424

# possible. Note that we don't have a lot of smarts about field

425

# syntax; we just try to break on semi-colons, then commas, then

426

# whitespace.

427

for ch in splitchars:

428

if line.find(ch) >= 0:

429

break

430

else:

431

# There's nothing useful to split the line on, not even spaces, so

432

# just append this line unchanged

lines.append(line)

maxlen = restlen

continue

# Now split the line on the character plus trailing whitespace

437

cre = re.compile(r'%s\s*' % ch)

if ch in ';,':

eol = ch

else:

eol = ''

joiner = eol + ' '

joinlen = len(joiner)

444

wslen = len(continuation_ws.replace('\t', SPACE8))

445

this = []

446

linelen = 0

447

for part in cre.split(line):

448

curlen = linelen + max(0, len(this)-1) * joinlen

449

partlen = len(part)

450

onfirstline = not lines

451

# We don't want to split after the field name, if we're on the

452

# first line and the field name is present in the header string.

453

if ch == ' ' and onfirstline and \

454

len(this) == 1 and fcre.match(this[0]):

455

this.append(part)

456

linelen += partlen

457

elif curlen + partlen > maxlen:

458

if this:

459

lines.append(joiner.join(this) + eol)

Barry Warsaw

bd836df

2003-03-06 20:33:04 +0000

[diff] [blame]

460

# If this part is longer than maxlen and we aren't already

461

# splitting on whitespace, try to recursively split this line

462

# on whitespace.

463

if partlen > maxlen and ch <> ' ':

Barry Warsaw

2003-03-07 15:39:37 +0000

[diff] [blame^]

464

subs = _split_ascii(part, maxlen, restlen,

465

continuation_ws, ' ')

466

subl = re.split(linejoiner, subs)

467

lines.extend(subl[:-1])

468

this = [subl[-1]]

Barry Warsaw

bd836df

2003-03-06 20:33:04 +0000

[diff] [blame]

469

else:

470

this = [part]

Barry Warsaw

2003-03-07 15:39:37 +0000

[diff] [blame^]

471

linelen = wslen + len(this[-1])

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

maxlen = restlen

else:

this.append(part)

linelen += partlen

# Put any left over parts on a line by themselves

477

if this:

478

lines.append(joiner.join(this))

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

479

return linejoiner.join(lines)

def _binsplit(splittable, charset, maxlinelen):

i = 0

j = len(splittable)

while i < j:

# Invariants:

# 1. splittable[:k] fits for all k <= i (note that we *assume*,

489

# at the start, that splittable[:0] fits).

490

# 2. splittable[:k] does not fit for any k > j (at the start,

491

# this means we shouldn't look at any k > len(splittable)).

492

# 3. We don't know about splittable[:k] for k in i+1..j.

493

# 4. We want to set i to the largest k that fits, with i <= k <= j.

494

#

495

m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j

496

chunk = charset.from_splittable(splittable[:m], True)

497

chunklen = charset.encoded_header_len(chunk)

498

if chunklen <= maxlinelen:

499

# m is acceptable, so is a new lower bound.

500

i = m

501

else:

Tim Peters

2b48213

2003-03-06 23:41:58 +0000

[diff] [blame]

502

# m is not acceptable, so final i must be < m.

Barry Warsaw