blob: de07c8d4b3e08a5f59b22d6fd9fbf0d560989c32 [file] [log] [blame]
"""Text wrapping and filling.
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002 Python Software Foundation.
# Written by Greg Ward <gward@python.net>

# XXX currently this module does not work very well with Unicode
# strings.  See http://www.python.org/sf/622831 for updates.
10
Greg Ward00935822002-06-07 21:43:37 +000011__revision__ = "$Id$"
12
13import string, re
14
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
whitespace = '\t\n\x0b\x0c\r '


class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
    """

    # Code-point -> code-point table for Unicode strings: every
    # recognized whitespace character maps to a plain space.
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, whitespace):
        unicode_whitespace_trans[x] = uspace

    # Table used for the native 'str' type.  Where string.maketrans()
    # exists, 'str' is an 8-bit string and needs a 256-character table;
    # where it does not, 'str' is already Unicode and the dict above is
    # the right kind of table.
    if hasattr(string, 'maketrans'):
        whitespace_trans = string.maketrans(whitespace,
                                            ' ' * len(whitespace))
    else:
        whitespace_trans = unicode_whitespace_trans

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(r'(\s+|'                  # any whitespace
                            r'-*\w{2,}-(?=\w{2,})|'   # hyphenated words
                            r'(?<=\S)-{2,}(?=\w))')   # em-dash

    # The pattern is anchored at the end of the chunk (\Z): it is applied
    # with search(), and without the anchor any "letter + period" inside
    # a chunk (e.g. "www.python.org") would be mistaken for a sentence
    # ending.  string.lowercase is used where it exists; otherwise the
    # ASCII lowercase letters are used.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % getattr(string, 'lowercase',
                                           string.ascii_lowercase))

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True):
        """Store the wrapping options; see the class docstring for the
        meaning of each one.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs (if 'expand_tabs' is set)
        and convert every character listed in the module-level
        'whitespace' string to a space (if 'replace_whitespace' is
        set).  Eg. "foo\\tbar\\n\\nbaz" becomes "foo     bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            # type(u'') is the Unicode string type; spelling it this way
            # avoids depending on a builtin named 'unicode'.
            elif isinstance(text, type(u'')):
                text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        """
        # re.split() yields empty strings between adjacent separators;
        # drop them.  Build a real list: _wrap_chunks() indexes into and
        # deletes from its argument.
        return [chunk for chunk in self.wordsep_re.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", _munge_whitespace()
        and _split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place; nothing is returned.
        """
        i = 0
        pat = self.sentence_end_re
        while i < len(chunks)-1:
            if chunks[i+1] == " " and pat.search(chunks[i]):
                chunks[i+1] = "  "
                # Skip the whitespace chunk we just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  'chunks' and 'cur_line' are
        modified in place; nothing is returned.
        """
        # When the usable width is less than one column (width <= 0, or
        # an indent at least as long as 'self.width'), still consume one
        # character per line -- otherwise _wrap_chunks() would never
        # make progress and would loop forever.
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            head = chunks[0][:space_left]
            tail = chunks[0][space_left:]
            # Don't append an empty piece: it would hide a trailing
            # whitespace chunk from the end-of-line stripping done by
            # _wrap_chunks().
            if head:
                cur_line.append(head)
            # Never leave an empty chunk at the front of the list.
            if tail:
                chunks[0] = tail
            else:
                del chunks[0]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(chunks.pop(0))

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        The 'chunks' list is consumed (emptied) in the process.
        """
        lines = []

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if chunks[0].strip() == '' and lines:
                del chunks[0]

            while chunks:
                chunk_len = len(chunks[0])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop(0))
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[0]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        indent = self.initial_indent
        # Fast path: text that already fits on the first line is returned
        # verbatim.  It must not be taken when sentence endings are to be
        # fixed, because that correction happens on the chunk list below.
        if (not self.fix_sentence_endings
                and len(text) + len(indent) <= self.width):
            return [indent + text]
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
Greg Ward00935822002-06-07 21:43:37 +0000277
278
Greg Warde807e572002-07-04 14:51:49 +0000279# -- Convenience interface ---------------------------------------------
Greg Ward00935822002-06-07 21:43:37 +0000280
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    The paragraph in 'text' is reformatted to fit in lines of at most
    'width' columns and the resulting lines are returned as a list.
    Unless overridden through keyword arguments, tabs are expanded and
    every other whitespace character (newline included) is turned into
    a space.  Any extra keyword arguments are handed to the TextWrapper
    constructor; see that class for the available options.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
Greg Ward00935822002-06-07 21:43:37 +0000293
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    The paragraph in 'text' is reformatted to fit in lines of at most
    'width' columns and returned as one string with the wrapped lines
    joined by newlines.  Whitespace is treated exactly as by wrap().
    Any extra keyword arguments are handed to the TextWrapper
    constructor; see that class for the available options.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
Greg Ward62080be2002-06-10 21:37:12 +0000304 return w.fill(text)