Blame - Lib/textwrap.py - platform/external/python/cpython3

2002-07-04 14:51:49 +0000

[diff] [blame]

1

"""Text wrapping and filling.

Greg Ward

2002-06-07 21:43:37 +0000

[diff] [blame]

2

"""

3

Greg Ward

78cc051

2002-10-13 19:23:18 +0000

[diff] [blame]

4

Greg Ward

523008c

2003-06-15 15:37:18 +0000

[diff] [blame]

5

Greg Ward

698d9f0

2002-06-07 22:40:23 +0000

[diff] [blame]

6

# Written by Greg Ward <gward@python.net>

7

Greg Ward

2002-06-07 21:43:37 +0000

[diff] [blame]

8

__revision__ = "$Id$"

import string, re

Greg Ward

2003-06-15 15:37:18 +0000

[diff] [blame]

12

# Do the right thing with boolean values for all known Python versions

13

# (so this module can be copied to projects that don't depend on Python

14

# 2.3, e.g. Optik and Docutils).

try:
    True, False
except NameError:
    # Interpreters that predate the boolean constants get integer stand-ins.
    # The names are bound through globals() rather than by a plain
    # assignment statement: "(True, False) = (1, 0)" is rejected at compile
    # time by Python 3 (True/False are keywords there), which would make the
    # whole module unimportable even though this branch never runs.
    globals()['True'] = 1
    globals()['False'] = 0

19

Greg Ward

2003-02-03 14:46:57 +0000

[diff] [blame]

20

# Names exported by "from <this module> import *".  dedent(), defined further
# down in this file, is not listed here.
__all__ = ['TextWrapper', 'wrap', 'fill']

21

Greg Ward

afd44de

2002-12-12 17:24:35 +0000

[diff] [blame]

22

# Hardcode the recognized whitespace characters to the US-ASCII

23

# whitespace characters. The main reason for doing this is that in

24

# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales

25

# that character winds up in string.whitespace. Respecting

26

# string.whitespace in those cases would 1) make textwrap treat 0xa0 the

27

# same as any other whitespace char, which is clearly wrong (it's a

28

# *non-breaking* space), 2) possibly cause problems with Unicode,

29

# since 0xa0 is not in range(128).

Greg Ward

2003-02-03 14:46:57 +0000

[diff] [blame]

30

_whitespace = '\t\n\x0b\x0c\r '


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
    """

    # Translation table handed to str.translate().  Python 2 wants the
    # 256-character table built by string.maketrans(); on Python 3 that
    # function is gone and str.maketrans() builds the equivalent
    # {ordinal: ordinal} mapping.
    try:
        whitespace_trans = string.maketrans(_whitespace,
                                            ' ' * len(_whitespace))
    except AttributeError:
        whitespace_trans = str.maketrans(_whitespace,
                                         ' ' * len(_whitespace))

    # {ordinal: ordinal} table for Python 2 unicode.translate().
    unicode_whitespace_trans = {}
    uspace = ord(' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # The Python 2 "unicode" type where it exists; plain str otherwise
    # (in which case the str branch of _munge_whitespace() always wins).
    try:
        _text_type = unicode
    except NameError:
        _text_type = str

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only).  Python 3 only has
    # string.ascii_lowercase, so fall back to that when needed.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 % getattr(string, 'lowercase',
                                           string.ascii_lowercase))

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True):
        """Store the wrapping options; see the class docstring for each."""
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. "  foo\\tbar\\n\\nbaz"
        becomes "  foo   bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Objects that are neither str nor unicode are passed through
            # untranslated, exactly as before.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, self._text_type):
                text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        # re.split() leaves empty strings between adjacent separators; drop
        # them.  A real list is required (not a lazy filter object) because
        # callers index into the result and reverse it in place.
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is returned.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks)-1:
            if chunks[i+1] == " " and pat.search(chunks[i]):
                chunks[i+1] = "  "
                # Skip the separator we just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'reversed_chunks' is the stack of
        pending chunks (next chunk last); both it and 'cur_line' are
        modified in place.
        """
        # When the indent leaves no room at all (width < 1), still consume
        # one character per line so the wrapping loop always makes progress.
        # Otherwise use exactly the room that remains; that is zero when
        # the current line is already full, in which case nothing more may
        # be squeezed onto it.
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            if space_left > 0:
                cur_line.append(reversed_chunks[-1][:space_left])
                reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        The 'chunks' list is reversed and consumed in place.  Raises
        ValueError if 'self.width' is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))

Greg Ward

2002-06-07 21:43:37 +0000

[diff] [blame]

288

289

Greg Ward

2002-07-04 14:51:49 +0000

[diff] [blame]

290

# -- Convenience interface ---------------------------------------------

Greg Ward

2002-06-07 21:43:37 +0000

[diff] [blame]

291

Greg Ward

cf02ac6

2002-06-10 20:36:07 +0000

[diff] [blame]

292

def wrap(text, width=70, **kwargs):
    """Wrap one paragraph and return the wrapped lines as a list.

    A throwaway TextWrapper is built from 'width' plus any extra keyword
    arguments, and its wrap() method is applied to 'text'.  By default
    that expands tabs and converts every other whitespace character
    (newlines included) to a space before the paragraph is broken into
    lines of at most 'width' columns.  See the TextWrapper class for the
    keyword arguments that customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)

Greg Ward

2002-06-07 21:43:37 +0000

[diff] [blame]

304

Greg Ward

2002-06-10 21:37:12 +0000

[diff] [blame]

305

def fill(text, width=70, **kwargs):
    """Fill one paragraph and return it as a single new string.

    A throwaway TextWrapper is built from 'width' plus any extra keyword
    arguments, and its fill() method is applied to 'text', producing the
    wrapped paragraph with lines of at most 'width' columns.  As with
    wrap(), tabs are expanded and other whitespace characters are
    converted to spaces by default.  See the TextWrapper class for the
    keyword arguments that customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

316

317

318

# -- Loosely related functionality -------------------------------------

319

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

320

# Matches a line made up solely of spaces and/or tabs.  MULTILINE makes
# ^ and $ anchor at every line, so each blank-looking line is matched.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)

321

# For every line that contains a character other than space, tab or newline,
# captures the (possibly empty) run of spaces/tabs that precedes that first
# such character.  The character itself is in a non-capturing group, so
# findall() yields only the indents.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

322

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

323

def dedent(text):

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

324

"""Remove any common leading whitespace from every line in `text`.

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

325

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

326

This can be used to make triple-quoted strings line up with the left

327

edge of the display, while still presenting them in the source code

328

in indented form.

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

329

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

330

Note that tabs and spaces are both treated as whitespace, but they

331

are not equal: the lines " hello" and "\thello" are

332

considered to have no common leading whitespace. (This behaviour is

333

new in Python 2.5; older versions of this module incorrectly

334

expanded tabs before searching for common leading whitespace.)

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

335

"""

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

336

# Look for the longest leading string of spaces and tabs common to

337

# all lines.

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

338

margin = None

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

339

text = _whitespace_only_re.sub('', text)

340

indents = _leading_whitespace_re.findall(text)

341

for indent in indents:

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

342

if margin is None:

343

margin = indent

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

344

345

# Current line more deeply indented than previous winner:

346

# no change (previous winner is still on top).

Tim Peters

4f96f1f

2006-06-11 19:42:51 +0000

[diff] [blame]

347

elif indent.startswith(margin):

348

pass

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

349

350

# Current line consistent with and no deeper than previous winner:

351

# it's the new winner.

Tim Peters

4f96f1f

2006-06-11 19:42:51 +0000

[diff] [blame]

352

elif margin.startswith(indent):

353

margin = indent

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

354

355

# Current line and previous winner have no common whitespace:

356

# there is no margin.

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

357

else:

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

358

margin = ""

359

break

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

360

Greg Ward

2006-06-11 00:40:49 +0000

[diff] [blame]

361

# sanity check (testing/debugging only)

362

if 0 and margin:

363

for line in text.split("\n"):

364

assert not line or line.startswith(margin), \

365

"line = %r, margin = %r" % (line, margin)

Greg Ward

2003-05-08 01:58:05 +0000

[diff] [blame]

366

Greg Ward