Blame - Lib/email/header.py - platform/external/python/cpython2

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Author: Ben Gertzfield, Barry Warsaw

3

# Contact: email-sig@python.org

4

5

"""Header encoding and decoding functionality."""

__all__ = [

'Header',

'decode_header',

'make_header',

]

import re

import binascii

import email.quoprimime

17

import email.base64mime

18

19

from email.errors import HeaderParseError

R. David Murray

9253214

2011-01-07 23:25:30 +0000

[diff] [blame]

20

from email import charset as _charset

21

Charset = _charset.Charset

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

# Small string/bytes constants shared by the functions and classes below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
# Width used when measuring a tab in continuation whitespace.
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 78

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\? # literal =?
  (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
  \? # literal ?
  (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
  \? # literal ?
  (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
  \?= # literal ?=
  (?=[ \t]|$) # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822. Character range is from exclamation mark (octal 041)
# to tilde (octal 176).
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Helpers
_max_append = email.quoprimime._max_append

def decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of (string, charset) pairs, one per decoded part of
    the header. For parts that were not encoded the charset is None; for
    encoded parts it is the lower-cased character set name taken from the
    encoded word.

    When the header holds no encoded word at all, the single pair
    (header, None) is returned with the header value untouched.

    An email.errors.HeaderParseError is raised when the base64 payload of an
    encoded word cannot be decoded.
    """
    # Nothing is encoded: hand the header back as is, with no charset.
    if ecre.search(header) is None:
        return [(header, None)]
    # Pass 1: cut every line into (text, encoding, charset) triplets. ecre
    # has three groups, so split() returns
    #   [plain, charset, encoding, encoded, plain, charset, ...]
    # i.e. plain text sits at every fourth index. Plain text carries None
    # for both the encoding and the charset.
    triplets = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                triplets.append((plain, None, None))
            if start + 1 < len(fields):
                charset = fields[start + 1].lower()
                encoding = fields[start + 2].lower()
                triplets.append((fields[start + 3], encoding, charset))
    # Pass 2: undo the quoted-printable or base64 transformation of every
    # encoded word, producing (decoded_word, charset) pairs.
    decoded = []
    for text, encoding, charset in triplets:
        if encoding is None:
            # Plain, unencoded text passes straight through.
            decoded.append((text, charset))
            continue
        if encoding == 'q':
            payload = email.quoprimime.header_decode(text)
        elif encoding == 'b':
            # Postel's law: add missing padding
            shortfall = len(text) % 4
            if shortfall:
                text += '==='[:4 - shortfall]
            try:
                payload = email.base64mime.decode(text)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
        decoded.append((payload, charset))
    # Pass 3: turn every word into bytes and merge neighbouring words that
    # share a charset. Unencoded neighbours are joined with a space,
    # encoded neighbours are joined directly.
    collapsed = []
    run = run_charset = None
    for word, charset in decoded:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if run is None:
            run, run_charset = word, charset
        elif charset != run_charset:
            collapsed.append((run, run_charset))
            run, run_charset = word, charset
        else:
            run += (BSPACE + word) if run_charset is None else word
    collapsed.append((run, run_charset))
    return collapsed

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from the pairs produced by decode_header().

    decoded_seq is a sequence of (decoded_string, charset) pairs, where
    charset is either None, the string name of a character set, or a
    Charset instance.

    The optional maxlinelen, header_name and continuation_ws arguments are
    passed unchanged to the Header constructor. The populated Header
    instance is returned.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # A charset of None is passed on untouched so that append() falls
        # back to the header's own default; anything else that is not yet
        # a Charset gets wrapped in one.
        if not (charset is None or isinstance(charset, Charset)):
            charset = Charset(charset)
        header.append(text, charset)
    return header

class Header:
    """A header value made of chunks, each with its own character set."""

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value. If None, the initial header
        value is not set. You can later append to the header with .append()
        method calls. s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method. It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument. If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen. For
        splitting the first line to a shorter value (to account for the field
        header which isn't included in s, e.g. `Subject') pass in the name of
        the field in header_name. The default maxlinelen is 78 as recommended
        by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # List of (unicode string, Charset) pairs, in append order.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset. Only do this for the second and subsequent chunks.
            nextcs = charset
            if nextcs == _charset.UNKNOWN8BIT:
                # Recover the original bytes hidden behind surrogate escapes
                # and show every non-ASCII byte as a replacement character.
                original_bytes = string.encode('ascii', 'surrogateescape')
                string = original_bytes.decode('ascii', 'replace')
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only. BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string. Both are fine so coerce
        # ourselves to a unicode (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance). A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string. If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset. If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string. In either case, when producing an RFC 2822 compliant
        header using RFC 2047 rules, the string will be encoded using the
        output codec of the charset. If the string cannot be encoded to the
        output codec, a UnicodeError will be raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            input_charset = charset.input_codec or 'us-ascii'
            s = s.decode(input_charset, errors)
        # Ensure that the string we're storing can be encoded to the output
        # character set, otherwise an early error is thrown. The encoded
        # result itself is discarded.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            s.encode(output_charset, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header. Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings. In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'. This doesn't affect RFC 2047 encoded lines.

        Optional maxlinelen overrides the maximum line length given to the
        constructor for this call; 0 means do not wrap.

        Optional linesep is a string to be used to separate the lines of
        the value. The default value is the most useful for typical
        Python applications, but it can be set to \\r\\n to produce
        RFC-compliant line separators when needed.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap. For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            lines = string.splitlines()
            # ''.splitlines() is an empty list, so an empty chunk (e.g.
            # Header('')) has no lines[0]; feed it as one empty line instead
            # of failing with IndexError.
            formatter.feed(lines[0] if lines else '', charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    # Encoded continuation lines get the continuation
                    # whitespace fed separately as plain ASCII.
                    formatter.feed(self._continuation_ws, USASCII)
                    line = ' ' + line.lstrip()
                formatter.feed(line, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        return formatter._str(linesep)

    def _normalize(self):
        """Merge adjacent chunks that share a charset, in place."""
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string.
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                # last_charset is None only before the first chunk is seen.
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks

class _ValueFormatter:
    """Collect the folded output lines of a header value.

    Header.encode() creates one instance, feeds it each chunk and finally
    joins the collected lines with _str().
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen is the initial size of the first line's accumulator,
        maxlen the length lines are compared against, continuation_ws the
        whitespace that starts continuation lines and splitchars the
        characters unencoded text may be split on.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        # A tab in the continuation whitespace counts as eight columns.
        self._continuation_ws_len = len(continuation_ws.replace('\t', SPACE8))
        self._splitchars = splitchars
        # Finished lines, and the line currently being built.
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the current line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        """Finish the current line and start a fresh, empty one."""
        # Drop a trailing None (transition marker) if there is one; any
        # other popped item is put straight back.
        end_of_line = self._current_line.pop()
        if end_of_line is not None:
            self._current_line.push(end_of_line)
        if len(self._current_line) > 0:
            self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Push a None marker onto the current line."""
        # _Accumulator counts a None as length 1 and renders it as a space.
        self._current_line.push(None)

    def feed(self, string, charset):
        """Add string, header-encoded per charset, folding where needed."""
        # If the string itself fits on the current line in its encoded format,
        # then add it now and be done with it.
        encoded_string = charset.header_encode(string)
        if len(encoded_string) + len(self._current_line) <= self._maxlen:
            self._current_line.push(encoded_string)
            return
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible. Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace. Eventually, this should be pluggable.
        if charset.header_encoding is None:
            # Pick the first split character, in splitchars order, that
            # occurs in the string; None if there is none.
            for ch in self._splitchars:
                if ch in string:
                    break
            else:
                ch = None
            # If there's no available split character then regardless of
            # whether the string fits on the line, we have to put it on a line
            # by itself.
            if ch is None:
                if not self._current_line.is_onlyws():
                    self._lines.append(str(self._current_line))
                    self._current_line.reset(self._continuation_ws)
                self._current_line.push(encoded_string)
            else:
                self._ascii_split(string, ch)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks. We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome. What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._current_line.push(first_line)
        self._lines.append(str(self._current_line))
        self._current_line.reset(self._continuation_ws)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        # The last encoded line stays open on the current line.
        self._current_line.push(last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the room left on the current line, then on each new line."""
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        # Every later line starts with the continuation whitespace.
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, string, ch):
        """Fold unencoded string across lines, splitting on character ch."""
        holding = _Accumulator()
        # Split the line on the split character, preserving it. If the split
        # character is whitespace RFC 2822 $2.2.3 requires us to fold on the
        # whitespace, so that the line leads with the original whitespace we
        # split on. However, if a higher syntactic break is used instead
        # (e.g. comma or semicolon), the folding should happen after the split
        # character. But then in that case, we need to add our own
        # continuation whitespace -- although won't that break unfolding?
        for part, splitpart, nextpart in _spliterator(ch, string):
            if not splitpart:
                # No splitpart means this is the last chunk. Put this part
                # either on the current line or the next line depending on
                # whether it fits.
                holding.push(part)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    # It fits, but we're done.
                    self._current_line.push(str(holding))
                else:
                    # It doesn't fit, but we're done. Before pushing a new
                    # line, watch out for the current line containing only
                    # whitespace.
                    holding.pop()
                    if self._current_line.is_onlyws() and holding.is_onlyws():
                        # Don't start a new line.
                        holding.push(part)
                        part = None
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    if part is None:
                        self._current_line.reset()
                    else:
                        holding.reset(part)
                        self._current_line.reset(str(holding))
                return
            elif not nextpart:
                # There must be some trailing split characters because we
                # found a split character but no next part. In this case we
                # must treat the thing to fit as the part + splitpart because
                # if splitpart is whitespace it's not allowed to be the only
                # thing on the line, and if it's not whitespace we must split
                # after the syntactic break. In either case, we're done.
                holding_prelen = len(holding)
                holding.push(part + splitpart)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    self._current_line.push(str(holding))
                elif holding_prelen == 0:
                    # This is the only chunk left so it has to go on the
                    # current line.
                    self._current_line.push(str(holding))
                else:
                    save_part = holding.pop()
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    holding.reset(save_part)
                    self._current_line.reset(str(holding))
                return
            elif not part:
                # We're leading with a split character. See if the splitpart
                # and nextpart fits on the current line.
                holding.push(splitpart + nextpart)
                holding_len = len(holding)
                # We know we're not leaving the nextpart on the stack.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(splitpart)
                else:
                    # It doesn't fit. Since there's no current part really
                    # the best we can do is start a new line and push the
                    # split part onto it.
                    self._current_line.push(str(holding))
                    holding.reset()
                    if len(self._current_line) > 0 and self._lines:
                        self._lines.append(str(self._current_line))
                        self._current_line.reset()
                    holding.push(splitpart)
            else:
                # All three parts are present. First let's see if all three
                # parts will fit on the current line. If so, we don't need to
                # split it.
                holding.push(part + splitpart + nextpart)
                holding_len = len(holding)
                # Pop the part because we'll push nextpart on the next
                # iteration through the loop.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(part + splitpart)
                else:
                    # The entire thing doesn't fit. See if we need to split
                    # before or after the split characters.
                    if splitpart.isspace():
                        # Split before whitespace. Remember that the
                        # whitespace becomes the continuation whitespace of
                        # the next line so it goes to current_line not holding.
                        holding.push(part)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        self._current_line.reset(splitpart)
                    else:
                        # Split after non-whitespace. The continuation
                        # whitespace comes from the instance variable.
                        holding.push(part + splitpart)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        if nextpart[0].isspace():
                            self._current_line.reset()
                        else:
                            self._current_line.reset(self._continuation_ws)
        # Get the last of the holding part
        self._current_line.push(str(holding))

def _spliterator(character, string):

552

parts = list(reversed(re.split('(%s)' % character, string)))

553

while parts:

554

part = parts.pop()

555

splitparts = (parts.pop() if parts else None)

556

nextpart = (parts.pop() if parts else None)

557

yield (part, splitparts, nextpart)

558

if nextpart is not None:

559

parts.append(nextpart)

560

561

562

class _Accumulator:

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

563

def __init__(self, initial_size=0):

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

564

self._initial_size = initial_size

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

565

self._current = []

566

567

def push(self, string):
    """Add string (or None) as the newest item."""
    items = self._current
    items.append(string)

569

570

def pop(self):
    """Remove and return the newest item, or return None if there is none."""
    return self._current.pop() if self._current else None

574

575

def __len__(self):

Barry Warsaw

2007-08-31 02:35:00 +0000

[diff] [blame]

576

return sum(((1 if string is None else len(string))

577

for string in self._current),

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

578

self._initial_size)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

579

580

def __str__(self):

Barry Warsaw

2007-08-31 02:35:00 +0000

[diff] [blame]

581

if self._current and self._current[-1] is None:

582

self._current.pop()

583

return EMPTYSTRING.join((' ' if string is None else string)

584

for string in self._current)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

585

586

def reset(self, string=None):
    """Discard all items and the initial size; optionally push string."""
    self._initial_size = 0
    self._current = []
    if string is None:
        return
    self.push(string)

Guido van Rossum