Blame - Lib/email/header.py - platform/external/python/cpython2

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Author: Ben Gertzfield, Barry Warsaw

3

# Contact: email-sig@python.org

4

5

"""Header encoding and decoding functionality."""

__all__ = [

'Header',

'decode_header',

'make_header',

]

import re

import binascii

import email.quoprimime

17

import email.base64mime

18

19

from email.errors import HeaderParseError

R. David Murray

9253214

2011-01-07 23:25:30 +0000

[diff] [blame]

20

from email import charset as _charset

21

Charset = _charset.Charset

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

# Small string/bytes constants shared by the helpers in this module.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length for an encoded header.
MAXLINELEN = 78

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  The character range \041-\176 runs from the
# exclamation mark ('!') to the tilde ('~'), i.e. printable ASCII without
# the space.  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attacks: a newline followed by a run of non-whitespace
# characters ending in a colon looks like the start of a new header line.
_embeded_header = re.compile(r'\n[^ \t]+:')

53

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

# Helpers

_max_append = email.quoprimime._max_append

def decode_header(header):
    """Decode a message header value without converting the charset.

    header is either a string, which may or may not contain RFC 2047
    encoded words, or a Header object.

    The result is a list of (value, charset) pairs, one per decoded part of
    the header.  charset is None for parts that were not encoded, otherwise
    it is the lower-cased charset name taken from the encoded word.  When
    the header contains no encoded word at all, the single pair
    (header, None) is returned with the header untouched.

    An email.errors.HeaderParseError is raised when base64 decoding of an
    encoded word fails.
    """
    # A Header object already carries its chunks; hand back a copy.
    if hasattr(header, '_chunks'):
        return list(header._chunks)
    # Nothing encoded: the header is its own (and only) part.
    if not ecre.search(header):
        return [(header, None)]

    # Stage 1: break every line into (text, encoding, charset) triplets.
    # ecre has three groups, so split() yields
    #   [plain, charset, encoding, payload, plain, charset, ...]
    # i.e. a plain-text slot at every fourth index, followed (except for
    # the final slot) by the three fields of one encoded word.
    words = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                words.append((plain, None, None))
            if start + 1 < len(fields):
                charset, encoding, payload = fields[start + 1:start + 4]
                words.append((payload, encoding.lower(), charset.lower()))

    # Stage 2: undo the base64 / quoted-printable transformation, giving
    # a list of (decoded_word, charset) pairs.
    decoded_words = []
    for payload, encoding, charset in words:
        if encoding is None:
            # Not an encoded word; pass the text through.
            decoded = payload
        elif encoding == 'q':
            decoded = email.quoprimime.header_decode(payload)
        elif encoding == 'b':
            # Postel's law: add missing padding
            remainder = len(payload) % 4
            if remainder:
                payload += '==='[:4 - remainder]
            try:
                decoded = email.base64mime.decode(payload)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
        decoded_words.append((decoded, charset))

    # Stage 3: convert every word to bytes and merge consecutive words
    # that share a charset.  Unencoded neighbours are joined with a space,
    # encoded neighbours are concatenated directly.
    collapsed = []
    run = run_charset = None
    for word, charset in decoded_words:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if run is None:
            run, run_charset = word, charset
        elif charset != run_charset:
            collapsed.append((run, run_charset))
            run, run_charset = word, charset
        elif run_charset is None:
            run += BSPACE + word
        else:
            run += word
    collapsed.append((run, run_charset))
    return collapsed

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (value, charset) pairs such as decode_header()
    returns.

    Each pair holds a decoded string and either a charset name, a Charset
    instance, or None.  The pairs are appended, in order, to a fresh Header
    instance which is then returned.

    Optional maxlinelen, header_name, and continuation_ws are passed
    straight to the Header constructor.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for value, charset in decoded_seq:
        # None is handed to append() as is; it falls back to the Header's
        # own default charset.  Plain names are wrapped in a Charset first.
        is_name = not (charset is None or isinstance(charset, Charset))
        header.append(value, Charset(charset) if is_name else charset)
    return header

class Header:
    """A header value built from chunks of text, each tagged with a charset.

    The value is stored as a list of (string, Charset) pairs in _chunks.
    str() gives the unencoded value; encode() gives the encoded and
    line-wrapped form.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  When it is None no initial
        value is set; text can be added later with .append().  s may be a
        byte string or a Unicode string, see .append() for the semantics.

        Optional charset has the same meaning as the charset argument of
        .append() and is also remembered as the default for later .append()
        calls that omit the charset.  When it is not given, us-ascii is
        used for both purposes.

        maxlinelen sets the maximum line length explicitly; the default is
        78, as recommended by RFC 2822.  To have the first line split at a
        shorter length (to account for the field name, e.g. 'Subject',
        which is not part of s), pass the field name in header_name.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually a space or a hard tab); it is prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        self._maxlinelen = MAXLINELEN if maxlinelen is None else maxlinelen
        # Take the separating colon and space into account.
        self._headerlen = 0 if header_name is None else len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        plain = (None, 'us-ascii')
        pieces = []
        previous_cs = None
        for text, charset in self._chunks:
            # Spaces between encoded and non-encoded parts must be
            # preserved, so a space is inserted whenever we cross from a
            # charset to None/us-ascii or the other way round.  This only
            # applies from the second chunk on.
            current_cs = charset
            if current_cs == _charset.UNKNOWN8BIT:
                # Round-trip through bytes so that surrogate-escaped data
                # is shown with replacement characters.
                raw = text.encode('ascii', 'surrogateescape')
                text = raw.decode('ascii', 'replace')
            if pieces:
                if previous_cs not in plain:
                    if current_cs in plain:
                        pieces.append(SPACE)
                        current_cs = None
                elif current_cs not in plain:
                    pieces.append(SPACE)
            previous_cs = current_cs
            pieces.append(text)
        return EMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Either way, reduce ourselves
        # to the unencoded string value and let other do the comparison
        # with the operands swapped.
        return other == str(self)

    def __ne__(self, other):
        return not (self == other)

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the
        name of a character set (which is converted to a Charset instance).
        None (the default) selects the charset given to the constructor.

        s may be a byte string or a Unicode string.  A byte string (i.e.
        isinstance(s, str) is false) is decoded using the charset's input
        codec, and a UnicodeError is raised if that fails.  For a Unicode
        string the charset is a hint naming the character set of its
        characters.  In both cases the text must be encodable with the
        charset's output codec, otherwise a UnicodeError is raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            s = s.decode(charset.input_codec or 'us-ascii', errors)
        # Trial-encode the text with the output codec so that an
        # unencodable string fails here rather than later; the result of
        # the encode call is deliberately discarded.
        output_codec = charset.output_codec or 'us-ascii'
        if output_codec != _charset.UNKNOWN8BIT:
            s.encode(output_codec, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        Each chunk is fed to a _ValueFormatter together with its charset,
        which encodes it (Base64 or quoted-printable where the charset
        calls for it) and wraps the result into lines.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Optional maxlinelen overrides the maximum line length given to the
        constructor; 0 means do not wrap.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \r\n to produce
        RFC-compliant line separators when needed.

        Raises HeaderParseError if the encoded value appears to contain an
        embedded header.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for text, charset in self._chunks:
            lines = text.splitlines()
            formatter.feed(lines[0] if lines else '', charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    formatter.feed(self._continuation_ws, USASCII)
                    line = ' ' + line.lstrip()
                formatter.feed(line, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                                   "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Collapse every run of chunks with the same charset into a single
        # chunk whose text is the run joined with single spaces.
        merged = []
        run = []
        run_charset = None
        for text, charset in self._chunks:
            if charset == run_charset:
                run.append(text)
                continue
            if run_charset is not None:
                merged.append((SPACE.join(run), run_charset))
            run = [text]
            run_charset = charset
        if run:
            merged.append((SPACE.join(run), run_charset))
        self._chunks = merged

class _ValueFormatter:
    """Accumulate encoded header text and wrap it into lines.

    Finished lines are collected in _lines; the line currently being built
    is kept in the _current_line accumulator.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        # Tabs in the continuation whitespace are counted as 8 columns.
        self._continuation_ws_len = len(continuation_ws.replace('\t', SPACE8))
        self._splitchars = splitchars
        self._lines = []
        # The first line starts out already occupied by headerlen columns.
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        # Flush whatever is pending, then join the finished lines.
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        # Pop the last item; it is only pushed back when it is a real
        # string, so a trailing transition marker (None) gets dropped.
        tail = self._current_line.pop()
        if tail is not None:
            self._current_line.push(tail)
        if len(self._current_line) > 0:
            self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        # None marks the boundary between two chunks on the current line.
        self._current_line.push(None)

    def feed(self, string, charset):
        # If the string itself fits on the current line in its encoded
        # format, then add it now and be done with it.
        encoded_string = charset.header_encode(string)
        if len(encoded_string) + len(self._current_line) <= self._maxlen:
            self._current_line.push(encoded_string)
            return
        # If the charset has no header encoding (i.e. it is an ASCII
        # encoding) then we must split the header at the "highest level
        # syntactic break" possible.  There are no smarts about field
        # syntax here: the split characters are simply tried in the order
        # given.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            split_on = next((c for c in self._splitchars if c in string),
                            None)
            if split_on is not None:
                self._ascii_split(string, split_on)
                return
            # There is no available split character, so regardless of
            # whether the string fits, it has to go on a line by itself.
            if not self._current_line.is_onlyws():
                self._lines.append(str(self._current_line))
                self._current_line.reset(self._continuation_ws)
            self._current_line.push(encoded_string)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on
        # the current line, minus the RFC 2047 chrome.  What makes this
        # trickier though is that we have to split at octet boundaries, not
        # character boundaries but it's only safe to split at character
        # boundaries so at best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._current_line.push(first_line)
        self._lines.append(str(self._current_line))
        self._current_line.reset(self._continuation_ws)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        # The last element starts the new current line ...
        self._current_line.push(last_line)
        # ... and everything in between is a full line in itself.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        # The first line only has the space left on the current line.
        yield self._maxlen - len(self._current_line)
        # Every later line loses the continuation whitespace.
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, string, ch):
        holding = _Accumulator()
        # Split the line on the split character, preserving it.  If the split
        # character is whitespace RFC 2822 $2.2.3 requires us to fold on the
        # whitespace, so that the line leads with the original whitespace we
        # split on.  However, if a higher syntactic break is used instead
        # (e.g. comma or semicolon), the folding should happen after the split
        # character.  But then in that case, we need to add our own
        # continuation whitespace -- although won't that break unfolding?
        #
        # NOTE: the statement order below is significant; the accumulators
        # are pushed, measured and popped in sequence.
        for part, splitpart, nextpart in _spliterator(ch, string):
            if not splitpart:
                # No splitpart means this is the last chunk.  Put this part
                # either on the current line or the next line depending on
                # whether it fits.
                holding.push(part)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    # It fits, but we're done.
                    self._current_line.push(str(holding))
                else:
                    # It doesn't fit, but we're done.  Before pushing a new
                    # line, watch out for the current line containing only
                    # whitespace.
                    holding.pop()
                    if self._current_line.is_onlyws() and holding.is_onlyws():
                        # Don't start a new line.
                        holding.push(part)
                        part = None
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    if part is None:
                        self._current_line.reset()
                    else:
                        holding.reset(part)
                        self._current_line.reset(str(holding))
                return
            elif not nextpart:
                # There must be some trailing split characters because we
                # found a split character but no next part.  In this case we
                # must treat the thing to fit as the part + splitpart because
                # if splitpart is whitespace it's not allowed to be the only
                # thing on the line, and if it's not whitespace we must split
                # after the syntactic break.  In either case, we're done.
                holding_prelen = len(holding)
                holding.push(part + splitpart)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    self._current_line.push(str(holding))
                elif holding_prelen == 0:
                    # This is the only chunk left so it has to go on the
                    # current line.
                    self._current_line.push(str(holding))
                else:
                    save_part = holding.pop()
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    holding.reset(save_part)
                    self._current_line.reset(str(holding))
                return
            elif not part:
                # We're leading with a split character.  See if the splitpart
                # and nextpart fits on the current line.
                holding.push(splitpart + nextpart)
                holding_len = len(holding)
                # We know we're not leaving the nextpart on the stack.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(splitpart)
                else:
                    # It doesn't fit.  Since there's no current part really
                    # the best we can do is start a new line and push the
                    # split part onto it.
                    self._current_line.push(str(holding))
                    holding.reset()
                    if len(self._current_line) > 0 and self._lines:
                        self._lines.append(str(self._current_line))
                        self._current_line.reset()
                    holding.push(splitpart)
            else:
                # All three parts are present.  First let's see if all three
                # parts will fit on the current line.  If so, we don't need to
                # split it.
                holding.push(part + splitpart + nextpart)
                holding_len = len(holding)
                # Pop the part because we'll push nextpart on the next
                # iteration through the loop.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(part + splitpart)
                else:
                    # The entire thing doesn't fit.  See if we need to split
                    # before or after the split characters.
                    if splitpart.isspace():
                        # Split before whitespace.  Remember that the
                        # whitespace becomes the continuation whitespace of
                        # the next line so it goes to current_line not holding.
                        holding.push(part)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        self._current_line.reset(splitpart)
                    else:
                        # Split after non-whitespace.  The continuation
                        # whitespace comes from the instance variable.
                        holding.push(part + splitpart)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        if nextpart[0].isspace():
                            self._current_line.reset()
                        else:
                            self._current_line.reset(self._continuation_ws)
        # Get the last of the holding part
        self._current_line.push(str(holding))

def _spliterator(character, string):

566

parts = list(reversed(re.split('(%s)' % character, string)))

567

while parts:

568

part = parts.pop()

569

splitparts = (parts.pop() if parts else None)

570

nextpart = (parts.pop() if parts else None)

571

yield (part, splitparts, nextpart)

572

if nextpart is not None:

573

parts.append(nextpart)

574

575

576

class _Accumulator:

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

577

def __init__(self, initial_size=0):

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

578

self._initial_size = initial_size

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

579

self._current = []

580

581

def push(self, string):

582

self._current.append(string)

583

584

def pop(self):

Barry Warsaw

2007-08-31 02:35:00 +0000

[diff] [blame]

585

if not self._current:

586

return None

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

587

return self._current.pop()

588

589

def __len__(self):

Barry Warsaw

2007-08-31 02:35:00 +0000

[diff] [blame]

590

return sum(((1 if string is None else len(string))

591

for string in self._current),

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

592

self._initial_size)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

593

594

def __str__(self):

Barry Warsaw

2007-08-31 02:35:00 +0000

[diff] [blame]

595

if self._current and self._current[-1] is None:

596

self._current.pop()

597

return EMPTYSTRING.join((' ' if string is None else string)

598

for string in self._current)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

599

600

def reset(self, string=None):

601

self._current = []

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

602

self._initial_size = 0

603

if string is not None:

604

self.push(string)

Guido van Rossum