blob: cdcfe9b28a0bf19624edf78d82407981909c6362 [file] [log] [blame]
Greg Ward00935822002-06-07 21:43:37 +00001"""
2Utilities for wrapping text strings and filling text paragraphs.
3"""
4
Greg Ward698d9f02002-06-07 22:40:23 +00005# Copyright (C) 2001 Gregory P. Ward.
6# Copyright (C) 2002 Python Software Foundation.
7# Written by Greg Ward <gward@python.net>
8
Greg Ward00935822002-06-07 21:43:37 +00009__revision__ = "$Id$"
10
11import string, re
12
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several boolean instance attributes control various aspects of
    wrapping:
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than the line width constraint.  If false,
        those words will not be broken, and some lines might be longer
        than the width constraint.
    """

    # Translation table mapping every character of string.whitespace to
    # a plain space.  str.maketrans is the Python 3 spelling and
    # string.maketrans the Python 2 one; use whichever exists so that the
    # class body can be executed by either interpreter.
    _maketrans = getattr(str, 'maketrans', None) or string.maketrans
    whitespace_trans = _maketrans(string.whitespace,
                                  ' ' * len(string.whitespace))
    del _maketrans

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(r'(\s+|'                  # any whitespace
                            r'\w{2,}-(?=\w{2,})|'     # hyphenated words
                            r'(?<=\w)-{2,}(?=\w))')   # em-dash

    # A chunk ends a sentence when its *last* characters are a lowercase
    # letter, sentence punctuation and an optional closing quote.  The
    # \Z anchor matters: without it "www.python.org" would match on its
    # interior "w." and be treated as the end of a sentence.
    # string.lowercase (Python 2) is used when present, otherwise
    # string.ascii_lowercase.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % getattr(string, 'lowercase',
                                           string.ascii_lowercase))

    def __init__(self):
        self.expand_tabs = True
        self.replace_whitespace = True
        self.fix_sentence_endings = False
        self.break_long_words = True

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            text = text.translate(self.whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        # re.split() with a capturing group yields empty strings between
        # adjacent separators; drop them.  A list comprehension is used
        # (rather than filter()) so the result is a real, mutable list
        # under every Python version -- _wrap_chunks() indexes and pops it.
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is returned.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks) - 1:
            if chunks[i + 1] == " " and pat.search(chunks[i]):
                chunks[i + 1] = "  "
                # Skip the whitespace chunk we just rewrote.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'chunks' and 'cur_line' are
        modified in place; nothing is returned.
        """
        space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            if space_left < 1:
                # The current line is already full.  Appending an empty
                # slice would hide a real trailing space from the
                # caller's whitespace clean-up, so leave the word for
                # the next line instead.
                if cur_line:
                    return
                # Nothing on the line and no room at all (non-positive
                # width): always consume at least one character so the
                # caller's loop is guaranteed to make progress.
                space_left = 1
            word = chunks[0]
            cur_line.append(word[:space_left])
            remainder = word[space_left:]
            if remainder:
                chunks[0] = remainder
            else:
                del chunks[0]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop(0))

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks, width, first_width=None):
        """_wrap_chunks(chunks : [string], width : int,
                        first_width : int = None) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'width' or less.  (If 'break_long_words' is false, some
        lines may be longer than 'width'.)  Chunks correspond roughly to
        words and the whitespace between them: each chunk is indivisible
        (modulo 'break_long_words'), but a line break can come between
        any two chunks.  Chunks should not have internal whitespace;
        ie. a chunk is either all whitespace or a "word".  Whitespace
        chunks will be removed from the beginning and end of lines, but
        apart from that whitespace is preserved.

        If 'first_width' is given, it is the limit for the first line
        only and 'width' applies to every later line; by default all
        lines share 'width'.  'chunks' is consumed (emptied) in place.
        """
        if first_width is None:
            first_width = width

        lines = []
        line_width = first_width

        while chunks:

            cur_line = []               # list of chunks (to-be-joined)
            cur_len = 0                 # length of current line

            # First chunk on line is whitespace -- drop it.
            if chunks[0].strip() == '':
                del chunks[0]

            while chunks:
                l = len(chunks[0])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= line_width:
                    cur_line.append(chunks.pop(0))
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[0]) > line_width:
                self._handle_long_word(chunks, cur_line, cur_len, line_width)

            # If the last chunk on this line is all whitespace, drop it.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(''.join(cur_line))
                # Only the first emitted line uses 'first_width'.
                line_width = width

        return lines

    def _wrap_text(self, text, width, first_prefix_len=0, other_prefix_len=0):
        """_wrap_text(text : string, width : int,
                      first_prefix_len : int = 0,
                      other_prefix_len : int = 0) -> [string]

        Shared implementation of wrap() and fill(): wrap 'text' so that
        the first line fits in 'width - first_prefix_len' columns and
        every other line in 'width - other_prefix_len' columns.  Raises
        ValueError if 'width' is not positive.
        """
        if width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % (width,))
        first_width = width - first_prefix_len
        rest_width = width - other_prefix_len

        text = self._munge_whitespace(text)
        # NOTE: text that already fits on the first line is returned
        # as-is (no whitespace stripping, no sentence-ending fix-up).
        if len(text) <= first_width:
            return [text]
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        # Pass 'first_width' only when it differs, so a subclass that
        # overrides _wrap_chunks(chunks, width) keeps working for the
        # plain wrap() case.
        if first_width == rest_width:
            return self._wrap_chunks(chunks, rest_width)
        return self._wrap_chunks(chunks, rest_width, first_width)

    # -- Public interface ----------------------------------------------

    def wrap(self, text, width):
        """wrap(text : string, width : int) -> [string]

        Split 'text' into multiple lines of no more than 'width'
        characters each, and return the list of strings that results.
        Tabs in 'text' are expanded with string.expandtabs(), and all
        other whitespace characters (including newline) are converted to
        space.  Raises ValueError if 'width' is not positive.
        """
        return self._wrap_text(text, width)

    def fill(self, text, width, initial_tab="", subsequent_tab=""):
        """fill(text : string,
                width : int,
                initial_tab : string = "",
                subsequent_tab : string = "")
           -> string

        Reformat the paragraph in 'text' to fit in lines of no more than
        'width' columns.  The first line is prefixed with 'initial_tab',
        and subsequent lines are prefixed with 'subsequent_tab'; the
        lengths of the tab strings are accounted for when wrapping lines
        to fit in 'width' columns.  Raises ValueError if 'width' is not
        positive.
        """
        lines = self._wrap_text(text, width,
                                len(initial_tab), len(subsequent_tab))
        sep = "\n" + subsequent_tab
        return initial_tab + sep.join(lines)
236
237
238# Convenience interface
239
# Single shared TextWrapper instance used by the module-level wrap() and
# fill() convenience functions below.
_wrapper = TextWrapper()
241
def wrap(text, width):
    """wrap(text : string, width : int) -> [string]

    Convenience function: delegate to the wrap() method of the shared
    module-level TextWrapper instance and return its list of lines.
    """
    wrapped_lines = _wrapper.wrap(text, width)
    return wrapped_lines
244
def fill(text, width, initial_tab="", subsequent_tab=""):
    """fill(text : string, width : int,
            initial_tab : string = "", subsequent_tab : string = "")
       -> string

    Convenience function: delegate to the fill() method of the shared
    module-level TextWrapper instance and return the resulting string.
    """
    filled_text = _wrapper.fill(text, width, initial_tab, subsequent_tab)
    return filled_text