blob: ef6c372ce51301518282de14c62a0070173ac33d [file] [log] [blame]
Greg Ward00935822002-06-07 21:43:37 +00001"""
2Utilities for wrapping text strings and filling text paragraphs.
3"""
4
5__revision__ = "$Id$"
6
7import string, re
8
9
def islower (c):
    """islower(c : string) -> bool

    Return true if 'c' is a lowercase letter.  'c' is expected to be a
    single character.  The empty string is never considered lowercase,
    and the test does not depend on the 'string' module's tables of
    letters.
    """
    return c.islower()
13
14
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of
    wrapping:
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than the line width constraint.  If false,
        those words will not be broken, and some lines might be longer
        than the width constraint.
    """

    # Translation table that maps every whitespace character to a space.
    # string.maketrans() is used where the 'string' module provides it;
    # otherwise fall back to str.maketrans(), so that the class body can
    # be evaluated whichever of the two is available.
    whitespace_trans = (getattr(string, 'maketrans', None) or str.maketrans)(
        string.whitespace, ' ' * len(string.whitespace))

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(r'(\s+|'                  # any whitespace
                            r'\w{2,}-(?=\w{2,})|'     # hyphenated words
                            r'(?<=\w)-{2,}(?=\w))')   # em-dash

    # Punctuation characters found at the end of a sentence.
    sentence_end = ".?!"


    def __init__ (self):
        self.expand_tabs = 1
        self.replace_whitespace = 1
        self.fix_sentence_endings = 0
        self.break_long_words = 1


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace (self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs (if 'expand_tabs' is
        true) and then convert every whitespace character to a space
        (if 'replace_whitespace' is true).  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz" with both options enabled.
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            text = text.translate(self.whitespace_trans)
        return text


    def _split (self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'

        The result is always a real list, because _wrap_chunks()
        indexes it and removes items from it in place.
        """
        # re.split() yields empty strings between adjacent separators;
        # drop them so that every chunk has at least one character.
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings (self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", _munge_whitespace()
        and _split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is
        returned.
        """
        i = 0
        punct = self.sentence_end
        while i < len(chunks)-1:
            word = chunks[i]
            # chunks[i] looks like the last word of a sentence (a
            # lowercase letter followed by sentence-ending punctuation),
            # and it's followed by a single space.  The length check
            # keeps a chunk that is nothing but one punctuation mark
            # from being indexed past its start.
            if (len(word) >= 2 and
                word[-1] in punct and
                chunks[i+1] == " " and
                word[-2].islower()):
                chunks[i+1] = "  "
                i += 2
            else:
                i += 1

    def _handle_long_word (self, chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'chunks' and 'cur_line' are
        modified in place; nothing is returned.
        """
        # If the width is not even one column, still take one character
        # per call; otherwise nothing would ever be removed from
        # 'chunks' and the main loop in _wrap_chunks() would never end.
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(chunks[0][0:space_left])
            chunks[0] = chunks[0][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop(0))

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks (self, chunks, width, first_width=None):
        """_wrap_chunks(chunks : [string],
                        width : int,
                        first_width : int = width)
           -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'width' or less.  (If 'break_long_words' is false, some
        lines may be longer than 'width'.)  Chunks correspond roughly to
        words and the whitespace between them: each chunk is indivisible
        (modulo 'break_long_words'), but a line break can come between
        any two chunks.  Chunks should not have internal whitespace;
        ie. a chunk is either all whitespace or a "word".  Whitespace
        chunks will be removed from the beginning and end of lines, but
        apart from that whitespace is preserved.

        If 'first_width' is given, the first line is limited to that
        many characters instead of 'width'.  'chunks' is consumed (left
        empty) by this method.
        """
        if first_width is None:
            first_width = width

        lines = []

        while chunks:

            cur_line = []                   # list of chunks (to-be-joined)
            cur_len = 0                     # length of current line

            # Until a line has been produced we are still building the
            # first one, which may have its own width limit.
            if lines:
                line_width = width
            else:
                line_width = first_width

            # First chunk on line is whitespace -- drop it.
            if chunks[0].strip() == '':
                del chunks[0]

            while chunks:
                l = len(chunks[0])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= line_width:
                    cur_line.append(chunks.pop(0))
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on a whole line of this width (not just this one).
            if chunks and len(chunks[0]) > line_width:
                self._handle_long_word(chunks, cur_line, cur_len, line_width)

            # If the last chunk on this line is all whitespace, drop it.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap (self, text, width, first_width=None):
        """wrap(text : string, width : int, first_width : int = width)
           -> [string]

        Split 'text' into multiple lines of no more than 'width'
        characters each, and return the list of strings that results.
        Depending on the 'expand_tabs' and 'replace_whitespace'
        attributes, tabs in 'text' are expanded and all other
        whitespace characters (including newline) are converted to
        space before wrapping.

        If 'first_width' is given, it limits the first line instead of
        'width'.  Text that already fits on the first line is returned
        as a one-element list without any further processing.
        """
        if first_width is None:
            first_width = width
        text = self._munge_whitespace(text)
        if len(text) <= first_width:
            return [text]
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        # Only pass the extra argument when it matters, so a subclass
        # that overrides _wrap_chunks(chunks, width) keeps working.
        if first_width == width:
            return self._wrap_chunks(chunks, width)
        return self._wrap_chunks(chunks, width, first_width)

    def fill (self, text, width, initial_tab="", subsequent_tab=""):
        """fill(text : string,
                width : int,
                initial_tab : string = "",
                subsequent_tab : string = "")
           -> string

        Reformat the paragraph in 'text' to fit in lines of no more than
        'width' columns.  The first line is prefixed with 'initial_tab',
        and subsequent lines are prefixed with 'subsequent_tab'; the
        lengths of the tab strings are accounted for when wrapping lines
        to fit in 'width' columns.
        """
        # Reserve room for the prefixes *before* wrapping, so that the
        # prefixed lines still fit in 'width' columns.
        first_width = width - len(initial_tab)
        rest_width = width - len(subsequent_tab)
        # Only pass the extra argument when the two limits differ, so a
        # subclass that overrides wrap(text, width) keeps working.
        if first_width == rest_width:
            lines = self.wrap(text, rest_width)
        else:
            lines = self.wrap(text, rest_width, first_width)
        sep = "\n" + subsequent_tab
        return initial_tab + sep.join(lines)
238
239
# Convenience interface: one module-level TextWrapper, created with its
# default settings when the module is loaded, which the module-level
# wrap() and fill() functions delegate to.

_wrapper = TextWrapper()
243
def wrap (text, width):
    """wrap(text : string, width : int) -> [string]

    Wrap 'text' with the module's shared TextWrapper instance and
    return whatever its wrap() method returns.
    """
    wrapped_lines = _wrapper.wrap(text, width)
    return wrapped_lines
246
def fill (text, width, initial_tab="", subsequent_tab=""):
    """fill(text : string,
            width : int,
            initial_tab : string = "",
            subsequent_tab : string = "")
       -> string

    Fill 'text' with the module's shared TextWrapper instance: all
    four arguments are handed to its fill() method unchanged, and that
    method's result is returned.
    """
    fill_method = _wrapper.fill
    return fill_method(text, width, initial_tab, subsequent_tab)