Blame - Lib/email/Header.py - platform/external/python/cpython3

2004-05-09 03:40:17 +0000

[diff] [blame]

1

2

# Author: che@debian.org (Ben Gertzfield), barry@python.org (Barry Warsaw)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

3

4

"""Header encoding and decoding functionality."""

5

6

import re

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

7

import binascii

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

8

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

9

import email.quopriMIME

10

import email.base64MIME

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

11

from email.Errors import HeaderParseError

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

12

from email.Charset import Charset

13

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

14

NL = '\n'

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

15

SPACE = ' '

Barry Warsaw

2003-03-06 16:10:30 +0000

[diff] [blame]

16

USPACE = u' '

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

17

SPACE8 = ' ' * 8

Barry Warsaw

2003-03-06 16:10:30 +0000

[diff] [blame]

18

UEMPTYSTRING = u''

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

MAXLINELEN = 76

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

22

USASCII = Charset('us-ascii')

23

UTF8 = Charset('utf-8')

24

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

25

# Match encoded-word strings in the form =?charset?q?Hello_World?=

26

ecre = re.compile(r'''

27

=\? # literal =?

28

(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset

29

\? # literal ?

30

(?P<encoding>[qb]) # either a "q" or a "b", case insensitive

31

\? # literal ?

32

(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string

33

\?= # literal ?=

34

''', re.VERBOSE | re.IGNORECASE)

35

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

36

# Field name regexp, including trailing colon, but not separating whitespace,

37

# according to RFC 2822. Character range is from exclamation mark to tilde.

38

# For use with .match()

39

fcre = re.compile(r'[\041-\176]+:$')

40

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

# Helpers

_max_append = email.quopriMIME._max_append

def decode_header(header):
    """Break a header value into decoded pieces without charset conversion.

    The header is coerced to a string and scanned line by line for RFC 2047
    encoded words.  The result is a list of (decoded_string, charset) pairs.
    For text that was not inside an encoded word, charset is None; for
    encoded words it is the lower-cased charset name taken from the word.

    Neighbouring unencoded fragments are glued together with a single space,
    and an encoded word whose charset equals that of the previous pair is
    concatenated onto that pair.

    HeaderParseError is raised when the base64 payload of an encoded word
    cannot be decoded.
    """
    header = str(header)
    # Nothing encoded anywhere in the value: hand it back as one plain piece.
    if ecre.search(header) is None:
        return [(header, None)]
    result = []
    for line in header.splitlines():
        # An individual line may still be free of encoded words.
        if ecre.search(line) is None:
            result.append((line, None))
            continue
        # ecre has three groups, so the split list is laid out as
        # [plain, charset, encoding, payload, plain, charset, ...].
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                # Extend a preceding unencoded piece rather than starting a
                # new one.
                if result and result[-1][1] is None:
                    result[-1] = (result[-1][0] + SPACE + plain, None)
                else:
                    result.append((plain, None))
            group = fields[start + 1:start + 4]
            if not group:
                # Trailing plain text only; no encoded word follows.
                continue
            charset, encoding = [item.lower() for item in group[0:2]]
            payload = group[2]
            if encoding == 'q':
                value = email.quopriMIME.header_decode(payload)
            elif encoding == 'b':
                try:
                    value = email.base64MIME.decode(payload)
                except binascii.Error:
                    # Surface a higher level exception.  BAW: the low level
                    # exception is discarded for now; it can be preserved
                    # when/if exception chaining becomes available.
                    raise HeaderParseError
            else:
                value = None
            if value is None:
                # Decoder produced nothing; fall back to the raw payload.
                value = payload

            if result and result[-1][1] == charset:
                result[-1] = (result[-1][0] + value, result[-1][1])
            else:
                result.append((value, charset))
    return result

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame]

105

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): each pair holds a decoded string and either None, a
    charset name, or a Charset instance.

    maxlinelen, header_name and continuation_ws are optional and are handed
    unchanged to the Header constructor.  The populated Header instance is
    returned.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # A charset of None stands for us-ascii and is passed straight
        # through to append(); anything else that is not already a Charset
        # gets wrapped in one.
        if not (charset is None or isinstance(charset, Charset)):
            charset = Charset(charset)
        header.append(text, charset)
    return header

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

128

class Header:
    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that may mix several charsets.

        s, when given, is the initial header value; when None the header
        starts out empty and is filled later through .append().  s may be a
        byte string or a Unicode string; .append() documents the semantics.

        charset plays two roles: it is used for the initial value s exactly
        as the charset argument of .append() would be, and it becomes the
        default for every later .append() call made without a charset.  When
        omitted, us-ascii is used for both.

        maxlinelen sets the maximum line length explicitly (default 76).
        Pass the field name (e.g. `Subject') as header_name to have the
        first line shortened to leave room for the field header, which is
        not part of s.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually a space or a hard tab); it is prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # Width of the continuation prefix with each tab counted as 8 columns.
        ws_columns = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        # Without a field name the first line is as long as any other.  With
        # one, make room for the name plus 2 extra for the colon and space.
        firstlen = maxlinelen
        if header_name is not None:
            firstlen = maxlinelen - len(header_name) - 2
        self._firstlinelen = firstlen
        # Second and subsequent lines lose the columns taken up by the
        # continuation whitespace prefix.
        self._maxlinelen = maxlinelen - ws_columns

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        pieces = []
        previous = None
        for text, charset in self._chunks:
            # Spaces at the boundary between encoded and non-encoded words
            # must be preserved, so a space is inserted whenever we move from
            # a charset to None/us-ascii or from None/us-ascii to a charset.
            # This only applies to the second and subsequent chunks.
            current = charset
            if pieces:
                current_is_plain = current in (None, 'us-ascii')
                if previous in (None, 'us-ascii'):
                    if not current_is_plain:
                        pieces.append(USPACE)
                elif current_is_plain:
                    pieces.append(USPACE)
                    current = None
            previous = current
            pieces.append(unicode(text, str(charset)))
        return UEMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense
    # to have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Either is fine: turn ourselves
        # into a string and let the swapped comparison do the work.
        encoded_self = self.encode()
        return other == encoded_self

    def __ne__(self, other):
        return not (self == other)

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        charset, when given, is either a Charset instance or the name of a
        character set (which is converted to a Charset instance).  None, the
        default, selects the charset given to the constructor.

        s may be a byte string or a Unicode string.  For a byte string (i.e.
        isinstance(s, str) is true), charset names the encoding of that byte
        string and a UnicodeError is raised if the string cannot be decoded
        with it.  For a Unicode string, charset is only a hint: the string is
        encoded by trying us-ascii, then the hinted charset, then utf-8, and
        the first one that does not provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # The faux 8bit charset means: leave the string exactly as it is.
        if charset != '8bit':
            # Check that the string survives a round trip to unicode and back
            # to bytes using the charset's input and output codecs.
            if isinstance(s, str):
                # May raise UnicodeError if the byte string cannot be decoded
                # with the charset's input codec.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Also verify that the unicode can be turned back into bytes
                # with the output codec, which may differ from the input
                # codec.  The original byte string is what gets stored.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Find a reasonable output codec for the unicode string; the
                # resulting byte string is what goes into the chunk.
                for charset in (USASCII, charset, UTF8):
                    try:
                        s = s.encode(charset.output_codec or 'us-ascii',
                                     errors)
                    except UnicodeError:
                        continue
                    break
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split up a header safely for use with encode_chunks.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # Undetermined raw 8bit characters in a byte string: there is no
        # knowing what the right thing to do is.  Splitting could break
        # multibyte data in the middle of a pair, so the least harmful choice
        # seems to be not to split at all, even though the result may then be
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  The aim is to be
        # faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        # folding can take place between many of the lexical tokens (and even
        # within some of the lexical tokens), folding SHOULD be limited to
        # placing the CRLF at higher-level syntactic breaks."
        #
        # For now this is only attempted for us-ascii, although other
        # charsets might also benefit from higher-level syntactic breaks.
        if charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        if elen == len(s):
            # Encoding does not change the size of the string, so it is safe
            # to cut exactly at the maxlinelen boundary.
            first = charset.from_splittable(splittable[:maxlinelen], False)
            last = charset.from_splittable(splittable[maxlinelen:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first has the proper length, so it only needs wrapping in the
        # appropriate chrome.  last must be recursively split.
        fencoded = charset.from_splittable(charset.to_splittable(first), True)
        rest = self._split(last, charset, self._maxlinelen, splitchars)
        return [(fencoded, charset)] + rest

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Delegate to the module-level splitter, then tag every resulting
        # line with the charset.
        pieces = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return zip(pieces, [charset] * len(pieces))

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.Utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        lines = []
        for text, charset in newchunks:
            # Empty chunks contribute nothing.
            if not text:
                continue
            if charset is not None and charset.header_encoding is not None:
                text = charset.header_encode(text)
            # Don't add more folding whitespace than necessary
            separator = ' '
            if lines and lines[-1].endswith(' '):
                separator = ''
            _max_append(lines, text, maxlinelen, separator)
        return (NL + self._continuation_ws).join(lines)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        Converting a string for use in an email header raises several
        issues.  Only certain character sets are readable in most email
        clients, and header strings may only contain a subset of 7-bit
        ASCII, so strings have to be converted and encoded (with Base64 or
        quoted-printable) with care.  There is also a 75-character length
        limit on any given encoded header field, so line-wrapping has to be
        performed, even with double-byte character sets.

        This method does its best to convert the string to the correct
        character set used in email, and to encode and line wrap it safely
        with the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        pieces = []
        firstlen = self._firstlinelen
        used = 0
        for text, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill up the current line, allowing one column for the space
            # that separates encoded words.
            room = firstlen - used - 1
            if room < charset.encoded_header_len(''):
                # Not even an empty encoded word fits; start on the next line
                room = firstlen
            pieces.extend(self._split(text, charset, room, splitchars))
            tail, tailcharset = pieces[-1]
            used = tailcharset.encoded_header_len(tail)
        return self._encode_chunks(pieces, firstlen)

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Fold an ASCII header string into a list of lines.

    s is the text to fold; it is processed one physical line at a time and
    leading whitespace on each line is discarded.  firstlen is the length
    limit applied to the first output line and restlen the limit applied
    once a line has been emitted.  continuation_ws is only used to account
    for the columns the continuation prefix will occupy (tabs count as 8);
    it is not added to the returned lines.

    splitchars is a string of candidate split characters, tried in order;
    the first one present in a line is used for that line.  The line is
    split on that character plus any whitespace following it.  When parts
    are joined back together, ';' and ',' are kept (followed by a space);
    any other split character is replaced by a single space.

    Returns the list of folded lines.  A line that is shorter than the
    current limit, or that contains none of the split characters, is
    returned unchanged.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is escaped so that a caller-supplied regex metacharacter
        # (e.g. '.', '(' or '\\') is matched literally, consistent with the
        # literal `ch in line' test above, instead of being interpreted as
        # regex syntax or making the pattern fail to compile.
        cre = re.compile(re.escape(ch) + r'\s*')
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            # Length of the pending line once its parts are joined.
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                # The pending parts are full; flush them as a finished line.
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                # The new pending line starts after the continuation prefix.
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

def _binsplit(splittable, charset, maxlinelen):

i = 0

j = len(splittable)

while i < j:

# Invariants:

# 1. splittable[:k] fits for all k <= i (note that we *assume*,

474

# at the start, that splittable[:0] fits).

475

# 2. splittable[:k] does not fit for any k > j (at the start,

476

# this means we shouldn't look at any k > len(splittable)).

477

# 3. We don't know about splittable[:k] for k in i+1..j.

478

# 4. We want to set i to the largest k that fits, with i <= k <= j.

479

#

480

m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j

481

chunk = charset.from_splittable(splittable[:m], True)

482

chunklen = charset.encoded_header_len(chunk)

483

if chunklen <= maxlinelen:

484

# m is acceptable, so is a new lower bound.

485

i = m

486

else:

Tim Peters

2b48213

2003-03-06 23:41:58 +0000

[diff] [blame]

487

# m is not acceptable, so final i must be < m.

Barry Warsaw