blob: 3230566076f9a68ac99dd2e5b8586c644a43d937 [file] [log] [blame]
Greg Ward00935822002-06-07 21:43:37 +00001"""
2Utilities for wrapping text strings and filling text paragraphs.
3"""
4
Greg Ward698d9f02002-06-07 22:40:23 +00005# Copyright (C) 2001 Gregory P. Ward.
6# Copyright (C) 2002 Python Software Foundation.
7# Written by Greg Ward <gward@python.net>
8
Greg Ward00935822002-06-07 21:43:37 +00009__revision__ = "$Id$"
10
11import string, re
12
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
    """

    # Translation table mapping every whitespace character to a space.
    # string.maketrans() only exists on Python 2; on Python 3 the
    # equivalent is the str.maketrans() static method.  The 'or' short-
    # circuits, so str.maketrans is never looked up on Python 2.
    whitespace_trans = (getattr(string, 'maketrans', None) or str.maketrans)(
        string.whitespace, ' ' * len(string.whitespace))

    # Same mapping keyed by ordinal, for text types whose translate()
    # wants a mapping rather than a 256-character table (Python 2
    # unicode strings).
    unicode_whitespace_trans = dict.fromkeys(map(ord, string.whitespace),
                                             ord(' '))

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(r'(\s+|'                  # any whitespace
                            r'\w{2,}-(?=\w{2,})|'     # hyphenated words
                            r'(?<=\w)-{2,}(?=\w))')   # em-dash

    # XXX will there be a locale-or-charset-aware version of
    # string.lowercase in 2.3?
    # The pattern is anchored with \Z so that only punctuation at the
    # *end* of a chunk counts; otherwise "www.python.org" would look
    # like a sentence ending because of the "w." inside it.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % (getattr(string, 'lowercase', None)
                                    or string.ascii_lowercase))

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True):
        """Store the wrapping options; see the class docstring for
        the meaning of each one.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        r"""_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\tbar\n\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Byte/native strings use the table built by maketrans();
            # anything else (Python 2 unicode) needs the ordinal mapping.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            else:
                text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        # re.split() leaves empty strings between adjacent separators;
        # build a real list (filter() is lazy on Python 3 and the
        # wrapping code indexes and mutates the result).
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        r"""_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks) - 1:
            if chunks[i + 1] == " " and pat.search(chunks[i]):
                chunks[i + 1] = "  "
                # Skip the whitespace chunk we just rewrote.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  Both 'chunks' and 'cur_line'
        are modified in place.
        """
        # When the indent is at least as wide as the requested width,
        # 'width' is zero or negative; still consume one character per
        # line so that the caller's loop is guaranteed to terminate.
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            # If the line is already exactly full, add nothing: an empty
            # piece would hide trailing whitespace on the line from the
            # caller's "drop trailing whitespace" step.
            if space_left > 0:
                cur_line.append(chunks[0][:space_left])
                rest = chunks[0][space_left:]
                if rest:
                    chunks[0] = rest
                else:
                    del chunks[0]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop(0))

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        'chunks' is consumed (emptied) in the process.
        """
        lines = []

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it.
            if chunks[0].strip() == '':
                del chunks[0]

            while chunks:
                l = len(chunks[0])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
                    cur_line.append(chunks.pop(0))
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[0]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Split 'text' into multiple lines of no more than 'self.width'
        characters each, and return the list of strings that results.
        Tabs in 'text' are expanded with string.expandtabs(), and all
        other whitespace characters (including newline) are converted to
        space.
        """
        text = self._munge_whitespace(text)
        # Shortcut for text that already fits on one line.  The first
        # line carries initial_indent, so the indent must both count
        # towards the width and appear in the result.
        indent = self.initial_indent
        if len(text) + len(indent) <= self.width:
            return [indent + text]
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the paragraph in 'text' to fit in lines of no more than
        'width' columns, and return the lines joined by newlines.
        """
        return "\n".join(self.wrap(text))
Greg Ward00935822002-06-07 21:43:37 +0000255
256
257# Convenience interface
258
def wrap(text, width=70, **kwargs):
    """wrap(text : string, width : int = 70, **kwargs) -> [string]

    Convenience function: construct a TextWrapper from 'width' and any
    extra keyword arguments, and return the result of calling its
    wrap() method on 'text'.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
Greg Ward00935822002-06-07 21:43:37 +0000262
def fill(text, width=70, **kwargs):
    """fill(text : string, width : int = 70, **kwargs) -> string

    Convenience function: construct a TextWrapper from 'width' and any
    extra keyword arguments, and return the result of calling its
    fill() method on 'text'.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
Greg Ward62080be2002-06-10 21:37:12 +0000265 return w.fill(text)