"""text_file

provides the TextFile class, which gives an interface to text files
that (optionally) takes care of stripping comments, ignoring blank
lines, and joining lines with backslashes."""

Greg Wardd1dc4751999-01-13 16:12:04 +00007__revision__ = "$Id$"
8
import os
import string
import sys
from types import *


class TextFile:

    """Provides a file-like object that takes care of all the things you
       commonly want to do when processing a text file that has some
       line-by-line syntax: strip comments (as long as "#" is your
       comment character), skip blank lines, join adjacent lines by
       escaping the newline (ie. backslash at end of line), strip
       leading and/or trailing whitespace.  All of these are optional
       and independently controllable.

       Provides a 'warn()' method so you can generate warning messages that
       report physical line number, even if the logical line in question
       spans multiple physical lines.  Also provides 'unreadline()' for
       implementing line-at-a-time lookahead.

       Constructor is called as:

           TextFile (filename=None, file=None, **options)

       It bombs (RuntimeError) if both 'filename' and 'file' are None;
       'filename' should be a string, and 'file' a file object (or
       something that provides 'readline()' and 'close()' methods).  It is
       recommended that you supply at least 'filename', so that TextFile
       can include it in warning messages.  If 'file' is not supplied,
       TextFile creates its own using the 'open()' builtin.  An option
       name that is not listed below raises KeyError.

       The options are all boolean, and affect the value returned by
       'readline()':
         strip_comments [default: true]
           strip from "#" to end-of-line, as well as any whitespace
           leading up to the "#" -- unless it is escaped by a backslash
         lstrip_ws [default: false]
           strip leading whitespace from each line before returning it
         rstrip_ws [default: true]
           strip trailing whitespace (including line terminator!) from
           each line before returning it
         skip_blanks [default: true]
           skip lines that are empty *after* stripping comments and
           whitespace.  (If both lstrip_ws and rstrip_ws are false,
           then some lines may consist of solely whitespace: these will
           *not* be skipped, even if 'skip_blanks' is true.)
         join_lines [default: false]
           if a backslash is the last non-newline character on a line
           after stripping comments and whitespace, join the following line
           to it to form one "logical line"; if N consecutive lines end
           with a backslash, then N+1 physical lines will be joined to
           form one logical line.
         collapse_join [default: false]
           strip leading whitespace from lines that are joined to their
           predecessor; only matters if (join_lines and not lstrip_ws)

       Note that since 'rstrip_ws' can strip the trailing newline, the
       semantics of 'readline()' must differ from those of the builtin file
       object's 'readline()' method!  In particular, 'readline()' returns
       None for end-of-file: an empty string might just be a blank line (or
       an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
       not."""

    # Option name -> default value; every key becomes an instance
    # attribute of the same name in '__init__()'.
    default_options = { 'strip_comments': 1,
                        'skip_blanks':    1,
                        'lstrip_ws':      0,
                        'rstrip_ws':      1,
                        'join_lines':     0,
                        'collapse_join':  0,
                      }

    def __init__(self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
           (a string) and 'file' (a file-like object) must be supplied.
           The keyword argument options are described above and affect
           the values returned by 'readline()'.

           Raises RuntimeError if neither 'filename' nor 'file' is given,
           and KeyError if an unknown option name is passed."""

        if filename is None and file is None:
            raise RuntimeError(
                "you must supply either or both of 'filename' and 'file'")

        # sanity check client option hash before opening anything
        for opt in options:
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        # set values for all options -- either from client option hash
        # or fallback to default_options
        for opt, default in self.default_options.items():
            setattr(self, opt, options.get(opt, default))

        if file is None:
            self.open(filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []


    def open(self, filename):
        """Open a new file named 'filename'.  This overrides both the
           'filename' and 'file' arguments to the constructor."""

        self.filename = filename
        # inside a method body the bare name 'open' is the builtin,
        # not this method
        self.file = open(self.filename, 'r')
        self.current_line = 0


    def close(self):
        """Close the current file and forget everything we know about it
           (filename, current line number).  Calling it again on an
           already-closed TextFile is a no-op."""

        fileobj = self.file
        self.file = None
        self.filename = None
        self.current_line = None
        if fileobj is not None:
            fileobj.close()


    def gen_error(self, msg, line=None):
        """Return 'msg' prefixed with the file name and line position.

           'line' defaults to the current line number; it may be an
           integer (formatted as "line N: ") or a list/tuple of two
           integers (formatted as "lines N-M: ").  The "<filename>, "
           prefix is left out when no filename is known, which happens
           when the object was built from a file object alone."""

        if line is None:
            line = self.current_line

        outmsg = []
        if self.filename is not None:
            outmsg.append(str(self.filename) + ", ")
        if isinstance(line, (list, tuple)):
            outmsg.append("lines %d-%d: " % tuple(line))
        else:
            outmsg.append("line %d: " % line)
        outmsg.append(str(msg))
        return "".join(outmsg)


    def error(self, msg, line=None):
        """Raise ValueError carrying 'msg' tagged with the file name and
           line position (see 'gen_error()' for the meaning of 'line')."""
        raise ValueError("error: " + self.gen_error(msg, line))

    def warn(self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
           line in the current file.  If the current logical line in the
           file spans multiple physical lines, the warning refers to the
           whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
           the current line number; it may be a list or tuple to indicate a
           range of physical lines, or an integer for a single physical
           line."""
        sys.stderr.write("warning: " + self.gen_error(msg, line) + "\n")


    def _advance_line(self, joining):
        """Record that one more physical line has been consumed.

           'current_line' is an integer while the logical line is a single
           physical line, and a two-element list [first, last] once
           physical lines have been joined.  If 'joining' is true the line
           just read belongs to the logical line under construction, so the
           range is extended (or started); otherwise a new logical line
           begins and 'current_line' collapses back to a plain integer."""

        current = self.current_line
        if joining:
            if isinstance(current, list):
                current[1] = current[1] + 1
            else:
                self.current_line = [current, current + 1]
        else:
            if isinstance(current, list):
                self.current_line = current[1] + 1
            else:
                self.current_line = current + 1


    def readline(self):
        """Read and return a single logical line from the current file (or
           from an internal buffer if lines have previously been "unread"
           with 'unreadline()').  If the 'join_lines' option is true, this
           may involve reading multiple physical lines concatenated into a
           single string.  Updates the current line number, so calling
           'warn()' after 'readline()' emits a warning about the physical
           line(s) just read.  Lines that hold nothing but a comment are
           dropped when 'strip_comments' is true (regardless of
           'skip_blanks') but still count as physical lines.  Returns None
           on end-of-file, since the empty string can occur if 'rstrip_ws'
           is true but 'skip_blanks' is not."""

        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop()

        buildup_line = ''

        while True:
            # read the line, make it None if EOF
            line = self.file.readline()
            if line == '':
                line = None

            # did previous line end with a backslash?  (true only while
            # a non-empty partial logical line is pending)
            joining = self.join_lines and buildup_line

            if self.strip_comments and line:

                # Look for the first "#" in the line.  If none, never
                # mind.  If we find one and it's the first character, or
                # is not preceded by "\", then it starts a comment --
                # strip the comment, strip whitespace before it, and
                # carry on.  Otherwise, it's just an escaped "#", so
                # unescape it (and any other escaped "#"'s that might be
                # lurking in there) and otherwise leave the line alone.

                pos = line.find("#")
                if pos == -1:           # no "#" -- no comments
                    pass

                # It's definitely a comment -- either "#" is the first
                # character, or it's elsewhere and unescaped.
                elif pos == 0 or line[pos-1] != "\\":
                    # Have to preserve the trailing newline, because it's
                    # the job of a later step (rstrip_ws) to remove it --
                    # and if rstrip_ws is false, we'd better preserve it!
                    if line[-1] == '\n':
                        eol = '\n'
                    else:
                        eol = ''
                    line = line[0:pos] + eol

                    # If all that's left is whitespace, then skip line
                    # *now*, before we try to join it to 'buildup_line' --
                    # that way constructs like
                    #   hello \\
                    #   # comment that should be ignored
                    #   there
                    # result in "hello there".  The skipped line is still
                    # a physical line, so count it to keep the numbers
                    # reported by warn()/error() accurate.
                    if line.strip() == "":
                        self._advance_line(joining)
                        continue

                else:                   # it's an escaped "#"
                    line = line.replace("\\#", "#")

            if joining:
                # oops: end of file
                if line is None:
                    self.warn("continuation line immediately precedes "
                              "end-of-file")
                    return buildup_line

                if self.collapse_join:
                    line = line.lstrip()
                line = buildup_line + line
                self._advance_line(1)

            # just an ordinary line, read it as usual
            else:
                if line is None:        # eof
                    return None
                self._advance_line(0)

            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip()
            elif self.lstrip_ws:
                line = line.lstrip()
            elif self.rstrip_ws:
                line = line.rstrip()

            # blank line (whether we rstrip'ed or not)?  skip to next line
            # if appropriate
            if (line == '' or line == '\n') and self.skip_blanks:
                continue

            if self.join_lines:
                # 'endswith' rather than indexing: 'line' may be the empty
                # string here when rstrip_ws is on and skip_blanks is off
                if line.endswith('\\'):
                    buildup_line = line[:-1]
                    continue

                if line.endswith('\\\n'):
                    buildup_line = line[:-2] + '\n'
                    continue

            # well, I guess there's some actual content there: return it
            return line

    # readline ()


    def readlines(self):
        """Read and return the list of all logical lines remaining in the
           current file."""

        lines = []
        while True:
            line = self.readline()
            if line is None:
                return lines
            lines.append(line)


    def unreadline(self, line):
        """Push 'line' (a string) onto an internal buffer that will be
           checked by future 'readline()' calls.  Handy for implementing
           a parser with line-at-a-time lookahead."""

        self.linebuf.append(line)


if __name__ == "__main__":
    # Self-test: write a small sample file, read it back through TextFile
    # under six option combinations, and print "ok N"/"not ok N" for each.
    test_data = """# test file

line 3 \\
# intervening comment
  continues on next line
"""
    # result 1: no fancy options
    result1 = [line + "\n" for line in test_data.split("\n")[0:-1]]

    # result 2: just strip comments
    result2 = ["\n",
               "line 3 \\\n",
               "  continues on next line\n"]

    # result 3: just strip blank lines
    result3 = ["# test file\n",
               "line 3 \\\n",
               "# intervening comment\n",
               "  continues on next line\n"]

    # result 4: default, strip comments, blank lines, and trailing whitespace
    result4 = ["line 3 \\",
               "  continues on next line"]

    # result 5: strip comments and blanks, plus join lines (but don't
    # "collapse" joined lines): the space before the backslash and the
    # continuation line's two leading spaces are all kept
    result5 = ["line 3   continues on next line"]

    # result 6: strip comments and blanks, plus join lines (and
    # "collapse" joined lines): the continuation's leading whitespace
    # is dropped
    result6 = ["line 3 continues on next line"]

    def test_input(count, description, file, expected_result):
        """Read every logical line from 'file', compare with
        'expected_result', print the verdict, and close 'file'."""
        try:
            result = file.readlines()
        finally:
            # release the handle so the sample file can be removed
            file.close()
        if result == expected_result:
            print("ok %d (%s)" % (count, description))
        else:
            print("not ok %d (%s):" % (count, description))
            print("** expected:")
            print(expected_result)
            print("** received:")
            print(result)


    filename = "test.txt"
    out_file = open(filename, "w")
    try:
        out_file.write(test_data)
    finally:
        out_file.close()

    try:
        in_file = TextFile(filename, strip_comments=0, skip_blanks=0,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(1, "no processing", in_file, result1)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=0,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(2, "strip comments", in_file, result2)

        in_file = TextFile(filename, strip_comments=0, skip_blanks=1,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(3, "strip blanks", in_file, result3)

        in_file = TextFile(filename)
        test_input(4, "default processing", in_file, result4)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=1,
                           join_lines=1, rstrip_ws=1)
        test_input(5, "join lines without collapsing", in_file, result5)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=1,
                           join_lines=1, rstrip_ws=1, collapse_join=1)
        test_input(6, "join lines with collapsing", in_file, result6)
    finally:
        # always clean up the sample file, even if a test blew up
        os.remove(filename)