blob: ccff2aba1287b61ca96918c8be631f6f0305ddbf [file] [log] [blame]
"""Text wrapping and filling.
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>
7
# Revision placeholder; "$Id$" is a version-control keyword string.
__revision__ = "$Id$"

import string, re

# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils).
#
# Merely evaluating the names raises NameError on an interpreter that
# does not define them; in that case bind module-level integer
# stand-ins so the rest of the file can use True/False unconditionally.
try:
    True, False
except NameError:
    (True, False) = (1, 0)

# Names exported by "from <module> import *".
# NOTE(review): dedent() is defined later in this file but is not listed
# here -- confirm whether that omission is intentional.
__all__ = ['TextWrapper', 'wrap', 'fill']
21
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r '

# Portability shims, in the same spirit as the True/False fallback used
# by this module: prefer the historical names, and fall back to their
# equivalents on interpreters that no longer provide them.
try:
    _unicode = unicode
except NameError:
    # No separate 'unicode' type: 'str' already is the Unicode text type.
    _unicode = str

_lowercase = getattr(string, 'lowercase', string.ascii_lowercase)


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
    """

    # Translation table for Unicode text: maps the ordinal of every
    # recognized whitespace character to the ordinal of a plain space.
    unicode_whitespace_trans = {}
    uspace = ord(' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # Translation table for 8-bit strings.  Where string.maketrans() is
    # not available, 'str' is the Unicode text type and the ordinal
    # mapping above is the right table for str.translate().
    try:
        whitespace_trans = string.maketrans(_whitespace,
                                            ' ' * len(_whitespace))
    except AttributeError:
        whitespace_trans = unicode_whitespace_trans

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    #
    # The trailing \Z matters: the pattern is applied with search(), so
    # without the anchor any "letter + period" *inside* a chunk (as in
    # "file.txt" or "www.python.org") would be mistaken for a sentence end.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % _lowercase)


    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True):
        """Store the wrapping options; see the class docstring for the
        meaning of each one.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Pick the translation table that matches the string type.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        # Splitting on a capturing group keeps the separators and also
        # yields empty strings between adjacent matches; drop those.
        # Always build a real list: _wrap_chunks() reverses and pops it.
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks)-1:
            if chunks[i+1] == " " and pat.search(chunks[i]):
                chunks[i+1] = "  "
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(reversed_chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'reversed_chunks' is the stack
        of remaining chunks (next chunk last); both it and 'cur_line'
        are modified in place.
        """
        # Room left on the current line.  Only when the indent leaves no
        # usable width at all do we force one character per pass, so that
        # the main loop is guaranteed to make progress.  Otherwise a line
        # that is already exactly full gets nothing more (space_left == 0)
        # rather than being pushed one character past 'width'.
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        Raises ValueError if 'self.width' is not positive.  Note that
        'chunks' is consumed: it is reversed in place and emptied.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                l = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
                    cur_line.append(chunks.pop())
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            # (This also discards the empty piece _handle_long_word()
            # appends when the line was already exactly full.)
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
Greg Ward00935822002-06-07 21:43:37 +0000288
289
# -- Convenience interface ---------------------------------------------
Greg Ward00935822002-06-07 21:43:37 +0000291
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    The paragraph in 'text' is reformatted so that every line is at most
    'width' columns wide, and the resulting lines are returned as a
    list.  Unless overridden, tabs in 'text' are expanded with
    string.expandtabs() and every other whitespace character (newline
    included) is turned into a space.  Any extra keyword arguments are
    handed to the TextWrapper constructor; see that class for the
    options available to customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
Greg Ward00935822002-06-07 21:43:37 +0000304
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    The paragraph in 'text' is reformatted so that every line is at most
    'width' columns wide, and the whole wrapped paragraph is returned as
    one new string.  Just like wrap(), tabs are expanded and other
    whitespace characters become spaces.  Any extra keyword arguments
    are handed to the TextWrapper constructor; see that class for the
    options available to customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
Greg Ward478cd482003-05-08 01:58:05 +0000316
317
# -- Loosely related functionality -------------------------------------
319
# Lines consisting solely of spaces and tabs (they never count towards
# the margin and are normalized to empty lines).
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the leading run of spaces/tabs of every line that has at
# least one non-whitespace character.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  When two lines
    agree only on part of their indentation (e.g. "  \\tfoo" and
    "   bar"), the part they do share ("  ") is what gets removed.

    Lines that contain only spaces and tabs are ignored when computing
    the margin and come back as empty lines.  Returns the dedented text.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Neither is a prefix of the other: keep only the whitespace
        # the two actually have in common.
        else:
            i = 0
            limit = min(len(margin), len(indent))
            while i < limit and margin[i] == indent[i]:
                i += 1
            margin = margin[:i]
            # Nothing in common at all: no later line can change that.
            if not margin:
                break

    # 'margin' holds only spaces and tabs, so it is safe to splice into
    # the pattern unescaped.
    if margin:
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
370
if __name__ == "__main__":
    # Tiny manual smoke test for dedent() when the file is run as a
    # script.  The single-argument, parenthesized print form is used
    # because it produces the same output whether 'print' is a statement
    # or a function.  Alternative inputs to try by hand:
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent(" \thello there\n \t how are you?"))
    print(dedent("Hello there.\n This is indented."))