Blame - Lib/email/header.py - platform/external/python/cpython2

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Author: Ben Gertzfield, Barry Warsaw

3

# Contact: email-sig@python.org

4

5

"""Header encoding and decoding functionality."""

__all__ = [

'Header',

'decode_header',

'make_header',

]

import re

import binascii

import email.quoprimime

17

import email.base64mime

18

19

from email.errors import HeaderParseError

R. David Murray

9253214

2011-01-07 23:25:30 +0000

[diff] [blame]

20

from email import charset as _charset

21

Charset = _charset.Charset

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

NL = '\n'

SPACE = ' '

BSPACE = b' '

SPACE8 = ' ' * 8

EMPTYSTRING = ''

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

28

MAXLINELEN = 78

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

29

FWS = ' \t'

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

30

31

USASCII = Charset('us-ascii')

32

UTF8 = Charset('utf-8')

33

34

# Match encoded-word strings in the form =?charset?q?Hello_World?=

35

ecre = re.compile(r'''

36

=\? # literal =?

37

(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset

38

\? # literal ?

39

(?P<encoding>[qb]) # either a "q" or a "b", case insensitive

40

\? # literal ?

41

(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string

42

\?= # literal ?=

43

(?=[ \t]|$) # whitespace or the end of the string

44

''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

45

46

# Field name regexp, including trailing colon, but not separating whitespace,

47

# according to RFC 2822.  Character range is from exclamation mark to tilde.

48

# For use with .match()

49

fcre = re.compile(r'[\041-\176]+:$')

50

Ezio Melotti

1392500

2011-03-16 11:05:33 +0200

[diff] [blame]

51

# Find a header embedded in a putative header value. Used to check for

R. David Murray

5b2d9dd

2011-01-09 02:35:24 +0000

[diff] [blame]

52

# header injection attack.

53

_embeded_header = re.compile(r'\n[^ \t]+:')

54

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

# Helpers

_max_append = email.quoprimime._max_append

def decode_header(header):
    """Decode a message header value without converting the character set.

    header is either a string, which may or may not contain RFC 2047
    encoded words, or a Header object (anything exposing a `_chunks'
    attribute is treated as one).

    The result is a list of (value, charset) pairs, one per decoded part:

    * For a Header object every chunk is returned as encoded bytes paired
      with the string name of its charset.
    * For a string without any encoded word the result is the single pair
      (header, None), with the string returned untouched.
    * Otherwise every value is a bytes object.  charset is None for the
      unencoded parts and the lower-cased charset name taken from the
      encoded word for the encoded parts.  Adjacent parts sharing a charset
      are merged into one pair.

    An email.errors.HeaderParseError is raised when base64 decoding of an
    encoded word fails.
    """
    # A Header object already holds its chunks; just encode each of them.
    if hasattr(header, '_chunks'):
        return [(_charset._encode(text, str(cs)), str(cs))
                for text, cs in header._chunks]
    # Without any encoded word there is nothing to decode.
    if ecre.search(header) is None:
        return [(header, None)]
    # Phase 1: break every physical line into (text, encoding, charset)
    # triples.  ecre.split() yields the plain text at every fourth position,
    # followed by the charset, encoding and payload groups of the encoded
    # word that terminated it.  Plain text carries None for the last two
    # triple members.
    triples = []
    for physical_line in header.splitlines():
        fields = ecre.split(physical_line)
        for offset in range(0, len(fields), 4):
            plain = fields[offset].strip()
            if plain:
                triples.append((plain, None, None))
            group = fields[offset + 1:offset + 4]
            if group:
                cs_name, enc_name, payload = group
                triples.append((payload, enc_name.lower(), cs_name.lower()))
    # Phase 2: undo the quoted-printable or base64 transformation of every
    # encoded word, producing (decoded, charset) pairs.
    decoded = []
    for payload, enc_name, cs_name in triples:
        if enc_name is None:
            # Plain text passes through unchanged.
            decoded.append((payload, cs_name))
            continue
        if enc_name == 'q':
            decoded.append((email.quoprimime.header_decode(payload), cs_name))
            continue
        if enc_name != 'b':
            raise AssertionError('Unexpected encoding: ' + enc_name)
        # Postel's law: be liberal and supply any missing base64 padding.
        missing = len(payload) % 4
        if missing:
            payload += '==='[:4 - missing]
        try:
            raw = email.base64mime.decode(payload)
        except binascii.Error:
            raise HeaderParseError('Base64 decoding error')
        decoded.append((raw, cs_name))
    # Phase 3: turn everything into bytes and merge consecutive parts that
    # share a charset.  Unencoded runs are joined with a single space,
    # encoded runs are concatenated directly.
    merged = []
    run_bytes = run_charset = None
    for piece, cs_name in decoded:
        if isinstance(piece, str):
            piece = piece.encode('raw-unicode-escape')
        if run_bytes is None:
            run_bytes, run_charset = piece, cs_name
        elif cs_name != run_charset:
            merged.append((run_bytes, run_charset))
            run_bytes, run_charset = piece, cs_name
        elif run_charset is None:
            run_bytes += BSPACE + piece
        else:
            run_bytes += piece
    merged.append((run_bytes, run_charset))
    return merged

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from pairs such as those returned by decode_header().

    decoded_seq is an iterable of (decoded_string, charset) pairs, where
    charset is None, a Charset instance, or the name of a character set.

    Each pair is appended, in order, to a freshly created Header which is
    then returned.  Optional maxlinelen, header_name, and continuation_ws
    are handed to the Header constructor unchanged.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # A charset of None is forwarded as-is so that Header.append() can
        # apply its own default; anything else that is not yet a Charset
        # instance gets wrapped in one.
        already_usable = cs is None or isinstance(cs, Charset)
        if not already_usable:
            cs = Charset(cs)
        result.append(text, cs)
    return result

class Header:
    """A header value assembled from chunks that may use different charsets.

    The value is kept in self._chunks as a list of (str, Charset) pairs.
    str(header) gives the unencoded value, header.encode() the RFC 2047
    encoded and folded representation.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 78 as
        recommended by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if nextcs == _charset.UNKNOWN8BIT:
                # Chunks of unknown charset hold their non-ASCII bytes as
                # surrogate escapes; recover the bytes and substitute the
                # replacement character for everything undecodable.
                original_bytes = string.encode('ascii', 'surrogateescape')
                string = original_bytes.decode('ascii', 'replace')
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a unicode (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  The one exception is the
        unknown-8bit charset, for which no codec exists: such a byte string
        is decoded as ASCII with non-ASCII bytes kept as surrogate escapes,
        which is the form __str__() expects for chunks of that charset.  If s
        is a Unicode string, then charset is a hint specifying the character
        set of the characters in the string.  In either case, when producing
        an RFC 2822 compliant header using RFC 2047 rules, the string will be
        encoded using the output codec of the charset.  If the string cannot
        be encoded to the output codec, a UnicodeError will be raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            input_charset = charset.input_codec or 'us-ascii'
            if input_charset == _charset.UNKNOWN8BIT:
                # 'unknown-8bit' is a marker, not a registered codec, so
                # decoding with it would raise LookupError.  Smuggle the raw
                # bytes through as surrogate escapes instead.
                s = s.decode('us-ascii', 'surrogateescape')
            else:
                s = s.decode(input_charset, errors)
        # Ensure that the bytes we're storing can be decoded to the output
        # character set, otherwise an early error is thrown.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            s.encode(output_charset, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Optional maxlinelen specifies the maximum length of each generated
        line, exclusive of the linesep string.  Individual lines may be longer
        than maxlinelen if a folding point cannot be found.  The first line
        will be shorter by the length of the header name plus ": " if a header
        name was specified at Header construction time.  The default value for
        maxlinelen is determined at header construction time.

        Optional splitchars is a string containing characters which should be
        given extra weight by the splitting algorithm during normal header
        wrapping.  This is in very rough support of RFC 2822's `higher level
        syntactic breaks':  split points preceded by a splitchar are preferred
        during line splitting, with the characters preferred in the order in
        which they appear in the string.  Space and tab may be included in the
        string to indicate whether preference should be given to one over the
        other as a split point when other split chars do not appear in the line
        being split.  Splitchars does not affect RFC 2047 encoded lines.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \r\n to produce RFC-compliant
        line separators when needed.

        Raises email.errors.HeaderParseError if the encoded value contains
        what looks like an embedded header (a possible header injection).
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            lines = string.splitlines()
            # The first physical line continues whatever is already pending;
            # an empty chunk is still fed so the formatter sees its charset.
            if lines:
                formatter.feed('', lines[0], charset)
            else:
                formatter.feed('', '', charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    formatter.feed(self._continuation_ws, ' ' + line.lstrip(),
                                   charset)
                else:
                    # Keep the line's own leading whitespace as its folding
                    # whitespace.
                    sline = line.lstrip()
                    fws = line[:len(line)-len(sline)]
                    formatter.feed(fws, sline, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string.
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks

class _ValueFormatter:
    """Accumulate header text and fold it into lines of limited length.

    Finished lines are collected in self._lines; the line currently being
    built is kept in self._current_line, an _Accumulator of
    (folding whitespace, text) parts.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen is handed to the _Accumulator of the first line as its
        initial size, maxlen is the line length limit, continuation_ws is
        prepended to continuation lines of encoded text, and splitchars
        lists the preferred split characters in order of preference.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        self._continuation_ws_len = len(continuation_ws)
        self._splitchars = splitchars
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the pending line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        """Move the pending line to the list of finished lines."""
        # Drop the trailing transition marker pushed by add_transition(), if
        # there is one; anything else popped is put straight back.
        end_of_line = self._current_line.pop()
        if end_of_line != (' ', ''):
            self._current_line.push(*end_of_line)
        if len(self._current_line) > 0:
            # A pending line for which is_onlyws() holds is glued onto the
            # previous line instead of becoming a line of its own.  That is
            # only possible when a previous line exists; without the check a
            # whitespace-only value would raise IndexError on the empty list.
            if self._current_line.is_onlyws() and self._lines:
                self._lines[-1] += str(self._current_line)
            else:
                self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Mark a chunk boundary with a single-space part."""
        self._current_line.push(' ', '')

    def feed(self, fws, string, charset):
        """Add string, preceded by the whitespace fws, to the output.

        charset decides whether the text is folded as plain ASCII or is
        encoded first via charset.header_encode_lines().
        """
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            self._ascii_split(fws, string, self._splitchars)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._append_chunk(fws, first_line)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        self.newline()
        # The last encoded line stays pending so that later text can extend
        # it; it is emitted after the full lines appended below.
        self._current_line.push(self._continuation_ws, last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the room left on the current line, then on each new line."""
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, fws, string, splitchars):
        """Feed unencoded text to _append_chunk() one word at a time."""
        # The RFC 2822 header folding algorithm is simple in principle but
        # complex in practice.  Lines may be folded any place where "folding
        # white space" appears by inserting a linesep character in front of the
        # FWS.  The complication is that not all spaces or tabs qualify as FWS,
        # and we are also supposed to prefer to break at "higher level
        # syntactic breaks".  We can't do either of these without intimate
        # knowledge of the structure of structured headers, which we don't have
        # here.  So the best we can do here is prefer to break at the specified
        # splitchars, and hope that we don't choose any spaces or tabs that
        # aren't legal FWS.  (This is at least better than the old algorithm,
        # where we would sometimes *introduce* FWS after a splitchar, or the
        # algorithm before that, where we would turn all white space runs into
        # single spaces or tabs.)
        parts = re.split("(["+FWS+"]+)", fws+string)
        # Arrange the list as alternating (whitespace, text) items: supply an
        # empty whitespace item when the text does not start with whitespace,
        # otherwise drop the empty text item that precedes the first run.
        if parts[0]:
            parts[:0] = ['']
        else:
            parts.pop(0)
        # Walk the list two items at a time.
        for fws, part in zip(*[iter(parts)]*2):
            self._append_chunk(fws, part)

    def _append_chunk(self, fws, string):
        """Append one (fws, string) part, folding the line if it got too long."""
        self._current_line.push(fws, string)
        if len(self._current_line) > self._maxlen:
            # Find the best split point, working backward from the end.
            # There might be none, on a long first line.
            for ch in self._splitchars:
                for i in range(self._current_line.part_count()-1, 0, -1):
                    if ch.isspace():
                        fws = self._current_line[i][0]
                        if fws and fws[0]==ch:
                            break
                    prevpart = self._current_line[i-1][1]
                    if prevpart and prevpart[-1]==ch:
                        break
                else:
                    # No split point for this character; try the next one.
                    continue
                break
            else:
                # No split point at all: the part just pushed has to stay in
                # one piece.
                fws, part = self._current_line.pop()
                if self._current_line._initial_size > 0:
                    # There will be a header, so leave it on a line by itself.
                    self.newline()
                    if not fws:
                        # We don't use continuation_ws here because the whitespace
                        # after a header should always be a space.
                        fws = ' '
                self._current_line.push(fws, part)
                return
            # Everything from the split point onwards starts the next line.
            remainder = self._current_line.pop_from(i)
            self._lines.append(str(self._current_line))
            self._current_line.reset(remainder)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

499

500

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

501

class _Accumulator(list):

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

502

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

503

def __init__(self, initial_size=0):

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

504

self._initial_size = initial_size

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

505

super().__init__()

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

506

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

507

def push(self, fws, string):

508

self.append((fws, string))

509

510

def pop_from(self, i=0):

511

popped = self[i:]

512

self[i:] = []

513

return popped

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

514

515

def pop(self):

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

516

if self.part_count()==0:

517

return ('', '')

518

return super().pop()

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

519

520

def __len__(self):

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

521

return sum((len(fws)+len(part) for fws, part in self),

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

522

self._initial_size)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

523

524

def __str__(self):

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

525

return EMPTYSTRING.join((EMPTYSTRING.join((fws, part))

526

for fws, part in self))

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

527

R David Murray

2011-04-18 10:04:34 -0400

[diff] [blame]

528

def reset(self, startval=None):

529

if startval is None:

530

startval = []

531

self[:] = startval

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

532

self._initial_size = 0

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

533

534

def is_onlyws(self):

R David Murray