Blame - Lib/email/header.py - platform/external/python/cpython3

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Author: Ben Gertzfield, Barry Warsaw

3

# Contact: email-sig@python.org

4

5

"""Header encoding and decoding functionality."""

__all__ = [

'Header',

'decode_header',

'make_header',

]

import re

import binascii

import email.quoprimime

17

import email.base64mime

18

19

from email.errors import HeaderParseError

R. David Murray

9253214

2011-01-07 23:25:30 +0000

[diff] [blame]

20

from email import charset as _charset

21

Charset = _charset.Charset

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

# Small string/bytes constants shared by the functions and classes below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 78
# Characters treated as folding white space when splitting ASCII values.
FWS = ' \t'

# Pre-built Charset instances for the two charsets this module falls back to.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

33

34

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (\041) to
# tilde (\176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack.
_embeded_header = re.compile(r'\n[^ \t]+:')

54

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

# Helpers

# Module-level alias for the private _max_append helper of email.quoprimime.
# NOTE(review): not referenced in the visible part of this file; presumably
# kept for backward compatibility -- confirm before removing.
_max_append = email.quoprimime._max_append

def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (string, charset) pairs containing each of the decoded
    parts of the header.  Charset is None for non-encoded parts of the header,
    otherwise a lower-case string containing the name of the character set
    specified in the encoded string.

    header may be a string that may or may not contain RFC2047 encoded words,
    or it may be a Header object.

    When the header contains no encoded words at all, the single returned
    pair holds the original str unchanged.  As soon as at least one encoded
    word is present, every returned string is a bytes object.

    An email.errors.HeaderParseError may be raised when certain decoding error
    occurs (e.g. a base64 decoding exception).
    """
    # If it is a Header object, we can just return the encoded chunks.
    if hasattr(header, '_chunks'):
        return [(_charset._encode(string, str(charset)), str(charset))
                for string, charset in header._chunks]
    # If no encoding, just return the header with no charset.
    if not ecre.search(header):
        return [(header, None)]
    # First step is to parse all the encoded parts into triplets of the form
    # (encoded_string, encoding, charset).  For unencoded strings, the last
    # two parts will be None.
    words = []
    for line in header.splitlines():
        # ecre has three groups, so split() yields
        # [unencoded, charset, encoding, encoded, unencoded, ...].
        parts = ecre.split(line)
        while parts:
            unencoded = parts.pop(0).strip()
            if unencoded:
                words.append((unencoded, None, None))
            if parts:
                charset = parts.pop(0).lower()
                encoding = parts.pop(0).lower()
                encoded = parts.pop(0)
                words.append((encoded, encoding, charset))
    # The next step is to decode each encoded word by applying the reverse
    # base64 or quopri transformation.  decoded_words is now a list of the
    # form (decoded_word, charset).
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # This is an unencoded word.
            decoded_words.append((encoded_string, charset))
        elif encoding == 'q':
            word = email.quoprimime.header_decode(encoded_string)
            decoded_words.append((word, charset))
        elif encoding == 'b':
            paderr = len(encoded_string) % 4   # Postel's law: add missing padding
            if paderr:
                encoded_string += '==='[:4 - paderr]
            try:
                word = email.base64mime.decode(encoded_string)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
            else:
                decoded_words.append((word, charset))
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
    # Now convert all words to bytes and collapse consecutive runs of
    # similarly encoded words.
    collapsed = []
    last_word = last_charset = None
    for word, charset in decoded_words:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if last_word is None:
            last_word = word
            last_charset = charset
        elif charset != last_charset:
            collapsed.append((last_word, last_charset))
            last_word = word
            last_charset = charset
        elif last_charset is None:
            # Adjacent unencoded words are re-joined with a single space.
            last_word += BSPACE + word
        else:
            last_word += word
    collapsed.append((last_word, last_charset))
    return collapsed

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Create a Header from a sequence of pairs as returned by decode_header()

    decode_header() takes a header value string and returns a sequence of
    pairs of the format (decoded_string, charset) where charset is the string
    name of the character set.

    This function takes one of those sequence of pairs and returns a Header
    instance.  Optional maxlinelen, header_name, and continuation_ws are as in
    the Header constructor.
    """
    h = Header(maxlinelen=maxlinelen, header_name=header_name,
               continuation_ws=continuation_ws)
    for s, charset in decoded_seq:
        # None means us-ascii but we can simply pass it on to h.append()
        if charset is not None and not isinstance(charset, Charset):
            charset = Charset(charset)
        h.append(s, charset)
    return h

class Header:
    """A header value built from (string, Charset) chunks.

    Chunks are added with append(); str() returns the unencoded value and
    encode() returns the folded, RFC 2047 encoded form.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen. For
        splitting the first line to a shorter value (to account for the field
        header which isn't included in s, e.g. `Subject') pass in the name of
        the field in header_name.  The default maxlinelen is 78 as recommended
        by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if nextcs == _charset.UNKNOWN8BIT:
                # Round-trip through bytes so surrogate-escaped characters
                # are replaced rather than emitted as lone surrogates.
                original_bytes = string.encode('ascii', 'surrogateescape')
                string = original_bytes.decode('ascii', 'replace')
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a unicode (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In either case, when producing an RFC 2822 compliant
        header using RFC 2047 rules, the string will be encoded using the
        output codec of the charset.  If the string cannot be encoded to the
        output codec, a UnicodeError will be raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            input_charset = charset.input_codec or 'us-ascii'
            if input_charset == _charset.UNKNOWN8BIT:
                s = s.decode('us-ascii', 'surrogateescape')
            else:
                s = s.decode(input_charset, errors)
        # Ensure that the bytes we're storing can be decoded to the output
        # character set, otherwise an early error is raised.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            try:
                s.encode(output_charset, errors)
            except UnicodeEncodeError:
                if output_charset != 'us-ascii':
                    raise
                # Non-ASCII text given with an ASCII charset: store it as
                # utf-8 instead of failing.
                charset = UTF8
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Optional maxlinelen specifies the maximum length of each generated
        line, exclusive of the linesep string.  Individual lines may be longer
        than maxlinelen if a folding point cannot be found.  The first line
        will be shorter by the length of the header name plus ": " if a header
        name was specified at Header construction time.  The default value for
        maxlinelen is determined at header construction time.

        Optional splitchars is a string containing characters which should be
        given extra weight by the splitting algorithm during normal header
        wrapping.  This is in very rough support of RFC 2822's `higher level
        syntactic breaks':  split points preceded by a splitchar are preferred
        during line splitting, with the characters preferred in the order in
        which they appear in the string.  Space and tab may be included in the
        string to indicate whether preference should be given to one over the
        other as a split point when other split chars do not appear in the line
        being split.  Splitchars does not affect RFC 2047 encoded lines.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \r\n to produce RFC-compliant
        line separators when needed.

        Raises HeaderParseError if the encoded value appears to contain an
        embedded header.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            lines = string.splitlines()
            if lines:
                formatter.feed('', lines[0], charset)
            else:
                formatter.feed('', '', charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    formatter.feed(self._continuation_ws, ' ' + line.lstrip(),
                                   charset)
                else:
                    # Keep the line's own leading whitespace as its FWS.
                    sline = line.lstrip()
                    fws = line[:len(line)-len(sline)]
                    formatter.feed(fws, sline, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string.
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks

class _ValueFormatter:
    """Accumulate header value chunks into a list of folded lines.

    Chunks are fed in with feed(); the current line is kept in an
    _Accumulator and flushed to self._lines whenever it grows past maxlen
    or newline() is called.  _str() returns the lines joined by a separator.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        self._continuation_ws_len = len(continuation_ws)
        self._splitchars = splitchars
        self._lines = []
        # The first line starts out "pre-filled" with the header name length.
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the current line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        """Flush the current line into self._lines and start a fresh one."""
        # Drop a trailing transition marker (see add_transition) if present.
        end_of_line = self._current_line.pop()
        if end_of_line != (' ', ''):
            self._current_line.push(*end_of_line)
        if len(self._current_line) > 0:
            # A whitespace-only line is glued onto the previous line, but
            # only when there is a previous line; otherwise indexing
            # self._lines[-1] would raise IndexError on a whitespace-only
            # value.
            if self._current_line.is_onlyws() and self._lines:
                self._lines[-1] += str(self._current_line)
            else:
                self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Mark a chunk boundary with a (' ', '') part on the current line."""
        self._current_line.push(' ', '')

    def feed(self, fws, string, charset):
        """Add string (preceded by fws) to the value, encoded per charset."""
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            self._ascii_split(fws, string, self._splitchars)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._append_chunk(fws, first_line)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        self.newline()
        self._current_line.push(self._continuation_ws, last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, fws, string, splitchars):
        # The RFC 2822 header folding algorithm is simple in principle but
        # complex in practice.  Lines may be folded any place where "folding
        # white space" appears by inserting a linesep character in front of the
        # FWS.  The complication is that not all spaces or tabs qualify as FWS,
        # and we are also supposed to prefer to break at "higher level
        # syntactic breaks".  We can't do either of these without intimate
        # knowledge of the structure of structured headers, which we don't have
        # here.  So the best we can do here is prefer to break at the specified
        # splitchars, and hope that we don't choose any spaces or tabs that
        # aren't legal FWS.  (This is at least better than the old algorithm,
        # where we would sometimes *introduce* FWS after a splitchar, or the
        # algorithm before that, where we would turn all white space runs into
        # single spaces or tabs.)
        parts = re.split("(["+FWS+"]+)", fws+string)
        # Normalize to alternating [fws, text, fws, text, ...] starting with
        # a (possibly empty) fws entry.
        if parts[0]:
            parts[:0] = ['']
        else:
            parts.pop(0)
        for fws, part in zip(*[iter(parts)]*2):
            self._append_chunk(fws, part)

    def _append_chunk(self, fws, string):
        self._current_line.push(fws, string)
        if len(self._current_line) > self._maxlen:
            # Find the best split point, working backward from the end.
            # There might be none, on a long first line.
            for ch in self._splitchars:
                for i in range(self._current_line.part_count()-1, 0, -1):
                    if ch.isspace():
                        fws = self._current_line[i][0]
                        if fws and fws[0] == ch:
                            break
                    prevpart = self._current_line[i-1][1]
                    if prevpart and prevpart[-1] == ch:
                        break
                else:
                    # No split point for this splitchar; try the next one.
                    continue
                break
            else:
                fws, part = self._current_line.pop()
                if self._current_line._initial_size > 0:
                    # There will be a header, so leave it on a line by itself.
                    self.newline()
                    if not fws:
                        # We don't use continuation_ws here because the whitespace
                        # after a header should always be a space.
                        fws = ' '
                self._current_line.push(fws, part)
                return
            remainder = self._current_line.pop_from(i)
            self._lines.append(str(self._current_line))
            self._current_line.reset(remainder)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

507

508

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

509

class _Accumulator(list):

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

510

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

511

def __init__(self, initial_size=0):

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

512

self._initial_size = initial_size

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

513

super().__init__()

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

514

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

515

def push(self, fws, string):

516

self.append((fws, string))

517

518

def pop_from(self, i=0):

519

popped = self[i:]

520

self[i:] = []

521

return popped

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

522

523

def pop(self):

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

524

if self.part_count()==0:

525

return ('', '')

526

return super().pop()

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

527

528

def __len__(self):

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

529

return sum((len(fws)+len(part) for fws, part in self),

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

530

self._initial_size)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

531

532

def __str__(self):

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

533

return EMPTYSTRING.join((EMPTYSTRING.join((fws, part))

534

for fws, part in self))

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

535

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

536

def reset(self, startval=None):

537

if startval is None:

538

startval = []

539

self[:] = startval

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

540

self._initial_size = 0

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

541

542

def is_onlyws(self):

R David Murray