blob: ffbb9d16341014544757c74cc5df3d2808735e19 [file] [log] [blame]
Greg Warde807e572002-07-04 14:51:49 +00001"""Text wrapping and filling.
Greg Ward00935822002-06-07 21:43:37 +00002"""
3
Greg Ward78cc0512002-10-13 19:23:18 +00004# Copyright (C) 1999-2001 Gregory P. Ward.
Greg Ward523008c2003-06-15 15:37:18 +00005# Copyright (C) 2002, 2003 Python Software Foundation.
Greg Ward698d9f02002-06-07 22:40:23 +00006# Written by Greg Ward <gward@python.net>
7
Greg Ward00935822002-06-07 21:43:37 +00008__revision__ = "$Id$"
9
10import string, re
11
# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils).
try:
    True, False
except NameError:
    # No boolean constants in this interpreter: fall back to the
    # integers 1 and 0, which behave the same in truth tests.
    (True, False) = (1, 0)

# Names exported by "from <this module> import *".
# NOTE(review): dedent() is defined further down in this file but is not
# listed here -- confirm whether that omission is intentional.
__all__ = ['TextWrapper', 'wrap', 'fill']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
# The six characters are: tab, newline, vertical tab, form feed,
# carriage return and space.
_whitespace = '\t\n\x0b\x0c\r '
Greg Wardafd44de2002-12-12 17:24:35 +000031
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table for 8-bit strings: every character listed in
    # _whitespace maps to a plain space, everything else to itself.
    whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))

    # The same mapping for unicode strings, whose translate() method
    # takes a dictionary keyed by code point.
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.lowercase)


    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True):
        """Store the wrapping options as instance attributes.

        Each argument is documented in the class docstring; no
        validation happens here (an invalid width is only reported
        when _wrap_chunks() runs).
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs (if 'expand_tabs' is set)
        and convert every character of the module's whitespace set to a
        space (if 'replace_whitespace' is set).  With both options on,
        " foo\\tbar\\n\\nbaz" becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # 8-bit and unicode strings need different kinds of
            # translation table; any other type is returned untouched.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        chunks = self.wordsep_re.split(text)
        # re.split() yields empty strings between adjacent separators;
        # they carry no text, so drop them.
        return [chunk for chunk in chunks if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is
        returned.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks)-1:
            if chunks[i+1] == " " and pat.search(chunks[i]):
                # Widen the single space after the sentence to two.
                chunks[i+1] = "  "
                # The chunk just rewritten is whitespace and cannot end
                # a sentence itself, so skip over it.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of pending chunks with the next
        one at the end; 'cur_line' holds the chunks already placed on
        the current line and 'cur_len' their total length; 'width' is
        the room available on this line after the indent.  Both lists
        may be modified in place.
        """
        # 'width' is below 1 when the indent is at least as wide as the
        # requested line width.  Take one character anyway so that every
        # pass makes progress.
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        When 'drop_whitespace' is true, whitespace chunks are removed
        from the beginning and end of lines; apart from that whitespace
        is preserved.

        The 'chunks' list is consumed: it is reversed in place and
        emptied as lines are built.  Raises ValueError if 'self.width'
        is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                l = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
                    cur_line.append(chunks.pop())
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  With the default options, tabs in 'text' are expanded
        with string.expandtabs(), and all other whitespace characters
        (including newline) are converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph (the lines produced by
        wrap(), joined with newlines).
        """
        return "\n".join(self.wrap(text))
Greg Ward00935822002-06-07 21:43:37 +0000298
299
Greg Warde807e572002-07-04 14:51:49 +0000300# -- Convenience interface ---------------------------------------------
Greg Ward00935822002-06-07 21:43:37 +0000301
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    A throwaway TextWrapper is built from 'width' and any extra keyword
    arguments, and its wrap() method is applied to 'text'.  The result
    is a list of lines, each at most 'width' columns wide.  By default,
    tabs in 'text' are expanded with string.expandtabs(), and all other
    whitespace characters (including newline) are converted to space.
    See the TextWrapper class for the keyword arguments that customize
    wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
Greg Ward00935822002-06-07 21:43:37 +0000314
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    A throwaway TextWrapper is built from 'width' and any extra keyword
    arguments, and its fill() method is applied to 'text'.  The result
    is one string holding the whole wrapped paragraph, with lines of at
    most 'width' columns.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space by default.  See the
    TextWrapper class for the keyword arguments that customize wrapping
    behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
Greg Ward478cd482003-05-08 01:58:05 +0000326
327
328# -- Loosely related functionality -------------------------------------
329
# Matches a line consisting solely of spaces and/or tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the leading run of spaces/tabs of every line that contains at
# least one non-whitespace character.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: a line indented with spaces and a line indented with
    a tab have no common leading whitespace.  (This behaviour is new in
    Python 2.5; older versions of this module incorrectly expanded tabs
    before searching for common leading whitespace.)

    Lines that consist only of spaces and tabs are ignored when the
    margin is computed and are reduced to empty lines in the result.
    Returns the dedented text; `text` is returned unchanged (apart from
    that normalization) when there is no common margin.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Neither is a prefix of the other: keep only the whitespace
        # the two indents actually share (possibly nothing).
        else:
            shared = 0
            limit = min(len(margin), len(indent))
            while shared < limit and margin[shared] == indent[shared]:
                shared += 1
            margin = margin[:shared]
            # An empty margin can never grow again, so stop looking.
            if not margin:
                break

    if margin:
        # margin holds only spaces and tabs, so it is safe to splice
        # into the pattern without escaping.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
380
if __name__ == "__main__":
    # Manual smoke test: run this file directly to see dedent() at work.
    # Alternative inputs, kept for experimenting by hand:
    #print dedent("\tfoo\n\tbar")
    #print dedent(" \thello there\n \t how are you?")
    print(dedent("Hello there.\n This is indented."))