blob: 552fef6415873cc7dfae81b253406e023623a22b [file] [log] [blame]
Greg Ward00935822002-06-07 21:43:37 +00001"""
2Utilities for wrapping text strings and filling text paragraphs.
3"""
4
Greg Ward698d9f02002-06-07 22:40:23 +00005# Copyright (C) 2001 Gregory P. Ward.
6# Copyright (C) 2002 Python Software Foundation.
7# Written by Greg Ward <gward@python.net>
8
Greg Ward00935822002-06-07 21:43:37 +00009__revision__ = "$Id$"
10
11import string, re
12
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
    """

    # Translation table mapping every ASCII whitespace character to a
    # plain space.  string.maketrans() only exists on Python 2;
    # str.maketrans() is its Python 3 replacement.  Both produce a table
    # accepted by the translate() method of the matching string type.
    try:
        whitespace_trans = string.maketrans(string.whitespace,
                                            ' ' * len(string.whitespace))
    except AttributeError:
        whitespace_trans = str.maketrans(string.whitespace,
                                         ' ' * len(string.whitespace))

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(r'(\s+|'                  # any whitespace
                            r'\w{2,}-(?=\w{2,})|'     # hyphenated words
                            r'(?<=\w)-{2,}(?=\w))')   # em-dash

    # A chunk ends a sentence when it *ends* with a lowercase letter,
    # sentence punctuation and an optional closing quote.  The \Z anchor
    # matters because the pattern is applied with search(): without it,
    # chunks such as "e.g.," or "www.python.org" would match in the
    # middle and be treated as sentence endings.
    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z')               # end of chunk

    def __init__(self,
                 width=70,
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True):
        """Store the wrapping options described in the class docstring."""
        self.width = width
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs (if 'expand_tabs' is true)
        and then convert every remaining whitespace character to a
        single space (if 'replace_whitespace' is true).  E.g. the three
        words foo, bar, baz separated by a tab and by two newlines come
        out separated by five spaces and by two spaces respectively.
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            text = text.translate(self.whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        # re.split() yields empty strings between adjacent separators;
        # discard them.  A real list is required (not a lazy filter
        # object) because _wrap_chunks() mutates the result in place.
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks', in place.
        E.g. when the original text contains "foo." at the end of one
        line and "Bar" at the start of the next, _munge_whitespace()
        and _split() turn that into [..., "foo.", " ", "Bar", ...],
        which has one too few spaces; this method simply changes the
        one space to two.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks) - 1:
            if chunks[i + 1] == " " and pat.search(chunks[i]):
                chunks[i + 1] = "  "
                # Skip the whitespace chunk that was just rewritten.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, chunks, cur_line, cur_len):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'chunks[0]' is the offending
        chunk, 'cur_line' the chunks already placed on the current line
        and 'cur_len' the number of columns already used on that line.
        Both lists are modified in place; nothing is returned.
        """
        space_left = self.width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            if space_left <= 0:
                if cur_line:
                    # The line is exactly full.  Appending an empty
                    # piece would only hide the line's trailing
                    # whitespace from the caller's clean-up step, so
                    # leave the word for the next line.
                    return
                # Nothing fits even on an empty line (width is zero,
                # negative, or used up by a prefix).  Take one
                # character anyway so the caller's loop always makes
                # progress instead of spinning forever.
                space_left = 1
            word = chunks[0]
            cur_line.append(word[:space_left])
            if len(word) > space_left:
                chunks[0] = word[space_left:]
            else:
                del chunks[0]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop(0))

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_line will be empty, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks, initial_cols=0, subsequent_cols=0):
        """_wrap_chunks(chunks : [string],
                        initial_cols : int = 0,
                        subsequent_cols : int = 0) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        'initial_cols' and 'subsequent_cols' are the number of columns
        to keep free on the first and on every later line (for a prefix
        the caller will prepend); the returned lines do not contain the
        prefix itself.  'chunks' is consumed in the process.
        """
        lines = []
        width = self.width

        while chunks:

            cur_line = []               # list of chunks (to-be-joined)

            # Columns reserved for the caller's prefix on this line.
            if lines:
                reserved = subsequent_cols
            else:
                reserved = initial_cols
            cur_len = reserved          # columns used so far on this line

            # First chunk on line is whitespace -- drop it.
            if chunks[0].strip() == '':
                del chunks[0]

            while chunks:
                l = len(chunks[0])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
                    cur_line.append(chunks.pop(0))
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[0]) > width - reserved:
                self._handle_long_word(chunks, cur_line, cur_len)

            # If the last chunk on this line is all whitespace, drop it.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(''.join(cur_line))

        return lines

    def _wrap_reserving(self, text, initial_cols, subsequent_cols):
        """_wrap_reserving(text : string,
                           initial_cols : int,
                           subsequent_cols : int) -> [string]

        Shared implementation of wrap() and fill(): wrap 'text' while
        keeping 'initial_cols' columns free on the first line and
        'subsequent_cols' columns free on every later line.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            # Must happen before the "already fits" test below,
            # otherwise short texts would never get their endings fixed.
            self._fix_sentence_endings(chunks)
            text = ''.join(chunks)

        # Text that fits on one line is returned as is, including any
        # leading or trailing whitespace.
        if initial_cols + len(text) <= self.width:
            return [text]

        if initial_cols or subsequent_cols:
            return self._wrap_chunks(chunks, initial_cols, subsequent_cols)
        # Plain call, so that subclasses overriding _wrap_chunks() with
        # the one-argument signature keep working for ordinary wrapping.
        return self._wrap_chunks(chunks)

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Split 'text' into multiple lines of no more than 'self.width'
        characters each, and return the list of strings that results.
        Tabs in 'text' are expanded with string.expandtabs(), and all
        other whitespace characters (including newline) are converted to
        space.  Text that already fits in 'self.width' columns is
        returned as a one-element list.
        """
        return self._wrap_reserving(text, 0, 0)

    def fill(self, text, initial_tab="", subsequent_tab=""):
        """fill(text : string,
                initial_tab : string = "",
                subsequent_tab : string = "")
           -> string

        Reformat the paragraph in 'text' to fit in lines of no more than
        'width' columns.  The first line is prefixed with 'initial_tab',
        and subsequent lines are prefixed with 'subsequent_tab'; the
        lengths of the tab strings are accounted for when wrapping lines
        to fit in 'width' columns.
        """
        if initial_tab or subsequent_tab:
            lines = self._wrap_reserving(text,
                                         len(initial_tab),
                                         len(subsequent_tab))
        else:
            # No prefixes: go through wrap() so that subclasses which
            # override it still affect fill().
            lines = self.wrap(text)
        sep = "\n" + subsequent_tab
        return initial_tab + sep.join(lines)
243
244
245# Convenience interface
246
def wrap(text, width):
    """wrap(text : string, width : int) -> [string]

    Convenience wrapper: build a throw-away TextWrapper configured with
    'width' (all other options at their defaults) and return the list
    of lines its wrap() method produces for 'text'.
    """
    wrapper = TextWrapper(width=width)
    return wrapper.wrap(text)
Greg Ward00935822002-06-07 21:43:37 +0000249
def fill(text, width, initial_tab="", subsequent_tab=""):
    """fill(text : string,
            width : int,
            initial_tab : string = "",
            subsequent_tab : string = "")
       -> string

    Convenience wrapper: build a throw-away TextWrapper configured with
    'width' (all other options at their defaults) and return the single
    string its fill() method produces for 'text', passing the two tab
    strings straight through.
    """
    wrapper = TextWrapper(width=width)
    return wrapper.fill(text, initial_tab, subsequent_tab)