blob: f22b3e91675176eab243112c48881f039663d40e [file] [log] [blame]
"""text_file

Provides the TextFile class, which gives an interface to text files
that (optionally) takes care of stripping comments, ignoring blank
lines, and joining lines with backslashes."""
6
7# created 1999/01/12, Greg Ward
8
9__revision__ = "$Id$"
10
11from types import *
Greg Ward60cd2862000-09-16 18:04:55 +000012import sys, os, string
Greg Wardd1dc4751999-01-13 16:12:04 +000013
14
class TextFile:

    """Provides a file-like object that takes care of all the things you
    commonly want to do when processing a text file that has some
    line-by-line syntax: strip comments (as long as "#" is your
    comment character), skip blank lines, join adjacent lines by
    escaping the newline (ie. backslash at end of line), strip
    leading and/or trailing whitespace.  All of these are optional
    and independently controllable.

    Provides a 'warn()' method so you can generate warning messages that
    report physical line number, even if the logical line in question
    spans multiple physical lines.  Also provides 'unreadline()' for
    implementing line-at-a-time lookahead.

    Constructor is called as:

        TextFile (filename=None, file=None, **options)

    It bombs (RuntimeError) if both 'filename' and 'file' are None;
    'filename' should be a string, and 'file' a file object (or
    something that provides 'readline()' and 'close()' methods).  It is
    recommended that you supply at least 'filename', so that TextFile
    can include it in warning messages.  If 'file' is not supplied,
    TextFile creates its own using the 'open()' builtin.

    The options are all boolean, and affect the value returned by
    'readline()':
      strip_comments [default: true]
        strip from the first "#" to end-of-line, unless that "#" is
        escaped by a backslash (in which case every escaped "#" on the
        line is unescaped and nothing is stripped).  Whitespace leading
        up to the "#" is only removed if 'rstrip_ws' is also true.
      lstrip_ws [default: false]
        strip leading whitespace from each line before returning it
      rstrip_ws [default: true]
        strip trailing whitespace (including line terminator!) from
        each line before returning it
      skip_blanks [default: true]
        skip lines that are empty *after* stripping comments and
        whitespace.  (If both lstrip_ws and rstrip_ws are false,
        then some lines may consist of solely whitespace: these will
        *not* be skipped, even if 'skip_blanks' is true.)
      join_lines [default: false]
        if a backslash is the last non-newline character on a line
        after stripping comments and whitespace, join the following line
        to it to form one "logical line"; if N consecutive lines end
        with a backslash, then N+1 physical lines will be joined to
        form one logical line.
      collapse_join [default: false]
        strip leading whitespace from lines that are joined to their
        predecessor; only matters if (join_lines and not lstrip_ws)

    Note that since 'rstrip_ws' can strip the trailing newline, the
    semantics of 'readline()' must differ from those of the builtin file
    object's 'readline()' method!  In particular, 'readline()' returns
    None for end-of-file: an empty string might just be a blank line (or
    an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
    not."""

    default_options = {
        'strip_comments': 1,
        'skip_blanks': 1,
        'lstrip_ws': 0,
        'rstrip_ws': 1,
        'join_lines': 0,
        'collapse_join': 0,
    }

    def __init__(self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
        (a string) and 'file' (a file-like object) must be supplied.
        The keyword argument options are described above and affect
        the values returned by 'readline()'.

        Raises RuntimeError if neither 'filename' nor 'file' is given,
        and KeyError if an unknown option name is passed."""

        if filename is None and file is None:
            raise RuntimeError(
                "you must supply either or both of 'filename' and 'file'")

        # sanity check client option hash before touching any state
        for opt in options:
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        # set values for all options -- either from client option hash
        # or fallback to default_options
        for opt, default in self.default_options.items():
            setattr(self, opt, options.get(opt, default))

        if file is None:
            self.open(filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []

    def open(self, filename):
        """Open a new file named 'filename'.  This overrides both the
        'filename' and 'file' arguments to the constructor."""

        self.filename = filename
        # inside a method, 'open' is the builtin, not this method
        self.file = open(self.filename, 'r')
        self.current_line = 0

    def close(self):
        """Close the current file and forget everything we know about it
        (filename, current line number).  Calling it again on an already
        closed TextFile is a no-op."""

        if self.file is not None:
            self.file.close()
        self.file = None
        self.filename = None
        self.current_line = None

    def warn(self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
        line in the current file.  If the current logical line in the
        file spans multiple physical lines, the warning refers to the
        whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
        the current line number; it may be a list or tuple to indicate a
        range of physical lines, or an integer for a single physical
        line.  If no filename is known (only 'file' was supplied to the
        constructor), the filename prefix is omitted."""

        if line is None:
            line = self.current_line

        parts = []
        if self.filename is not None:
            parts.append(self.filename + ", ")
        if isinstance(line, (list, tuple)):
            parts.append("lines %d-%d: " % tuple(line))
        else:
            parts.append("line %d: " % line)
        parts.append(str(msg) + "\n")
        sys.stderr.write("".join(parts))

    def readline(self):
        """Read and return a single logical line from the current file (or
        from an internal buffer if lines have previously been "unread"
        with 'unreadline()').  If the 'join_lines' option is true, this
        may involve reading multiple physical lines concatenated into a
        single string.  Updates the current line number, so calling
        'warn()' after 'readline()' emits a warning about the physical
        line(s) just read.  Returns None on end-of-file, since the empty
        string can occur if 'rstrip_ws' is true but 'skip_blanks' is
        not."""

        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop()

        buildup_line = ''

        while 1:
            # read the line, make it None if EOF
            line = self.file.readline()
            if line == '':
                line = None

            if self.strip_comments and line:

                # Look for the first "#" in the line.  If none, never
                # mind.  If we find one and it's the first character, or
                # is not preceded by "\", then it starts a comment --
                # strip the comment and carry on.  Otherwise, it's just
                # an escaped "#", so unescape it (and any other escaped
                # "#"'s that might be lurking in there) and otherwise
                # leave the line alone.

                pos = line.find("#")
                if pos == -1:           # no "#" -- no comments
                    pass
                elif pos == 0 or line[pos - 1] != "\\":   # it's a comment

                    # Have to preserve the trailing newline, because it's
                    # the job of a later step (rstrip_ws) to remove it --
                    # and if rstrip_ws is false, we'd better preserve it!
                    # (A final all-comment line with no trailing newline
                    # becomes the empty string and is then handled like
                    # any other blank line.)
                    if line.endswith("\n"):
                        line = line[:pos] + "\n"
                    else:
                        line = line[:pos]

                else:                   # it's an escaped "#"
                    line = line.replace("\\#", "#")

            # did previous line end with a backslash? then accumulate
            if self.join_lines and buildup_line:
                # oops: end of file
                if line is None:
                    self.warn("continuation line immediately precedes "
                              "end-of-file")
                    return buildup_line

                if self.collapse_join:
                    line = line.lstrip()
                line = buildup_line + line

                # careful: pay attention to line number when incrementing it
                # ('current_line' is a [first, last] list while a logical
                # line spans several physical lines)
                if isinstance(self.current_line, list):
                    self.current_line[1] = self.current_line[1] + 1
                else:
                    self.current_line = [self.current_line,
                                         self.current_line + 1]

            # just an ordinary line, read it as usual
            else:
                if line is None:        # eof
                    return None

                # still have to be careful about incrementing the line number!
                if isinstance(self.current_line, list):
                    self.current_line = self.current_line[1] + 1
                else:
                    self.current_line = self.current_line + 1

            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip()
            elif self.lstrip_ws:
                line = line.lstrip()
            elif self.rstrip_ws:
                line = line.rstrip()

            # blank line (whether we rstrip'ed or not)? skip to next line
            # if appropriate.  The parentheses matter: 'and' binds tighter
            # than 'or', so without them empty lines would be skipped even
            # when 'skip_blanks' is false.
            if (line == '' or line == '\n') and self.skip_blanks:
                continue

            if self.join_lines:
                # 'endswith' rather than indexing: 'line' may be empty
                # here when 'skip_blanks' is false
                if line.endswith('\\'):
                    buildup_line = line[:-1]
                    continue

                if line.endswith('\\\n'):
                    buildup_line = line[:-2] + '\n'
                    continue

            # well, I guess there's some actual content there: return it
            return line

    def readlines(self):
        """Read and return the list of all logical lines remaining in the
        current file."""

        lines = []
        while 1:
            line = self.readline()
            if line is None:
                return lines
            lines.append(line)

    def unreadline(self, line):
        """Push 'line' (a string) onto an internal buffer that will be
        checked by future 'readline()' calls.  Handy for implementing
        a parser with line-at-a-time lookahead."""

        self.linebuf.append(line)
287
288
if __name__ == "__main__":
    # Self-test: write a small sample file, read it back through TextFile
    # with several option combinations, and print "ok N"/"not ok N" for
    # each combination.

    # NB. the continuation line starts with exactly one space; the
    # expected results below depend on that.
    test_data = """# test file

line 3 \\
 continues on next line
"""

    # result 1: no fancy options
    result1 = [line + "\n" for line in test_data.split("\n")[0:-1]]

    # result 2: just strip comments
    result2 = ["\n", "\n", "line 3 \\\n", " continues on next line\n"]

    # result 3: just strip blank lines
    result3 = ["# test file\n", "line 3 \\\n", " continues on next line\n"]

    # result 4: default, strip comments, blank lines, and trailing whitespace
    result4 = ["line 3 \\", " continues on next line"]

    # result 5: strip comments and blanks, plus join lines (but don't
    # "collapse" joined lines): the space before the backslash and the
    # leading space of the continuation line are both kept
    result5 = ["line 3 " + " continues on next line"]

    # result 6: strip comments and blanks, plus join lines (and
    # "collapse" joined lines): the continuation's leading space is dropped
    result6 = ["line 3 continues on next line"]

    def test_input(count, description, file, expected_result):
        """Read every logical line from 'file' (a TextFile) and report
        whether the list matches 'expected_result'."""
        result = file.readlines()
        if result == expected_result:
            print("ok %d (%s)" % (count, description))
        else:
            print("not ok %d (%s):" % (count, description))
            print("** expected:")
            print(expected_result)
            print("** received:")
            print(result)

    # (test number, description, TextFile options, expected lines)
    test_cases = [
        (1, "no processing",
         {'strip_comments': 0, 'skip_blanks': 0,
          'lstrip_ws': 0, 'rstrip_ws': 0},
         result1),
        (2, "strip comments",
         {'strip_comments': 1, 'skip_blanks': 0,
          'lstrip_ws': 0, 'rstrip_ws': 0},
         result2),
        (3, "strip blanks",
         {'strip_comments': 0, 'skip_blanks': 1,
          'lstrip_ws': 0, 'rstrip_ws': 0},
         result3),
        (4, "default processing", {}, result4),
        (5, "join lines without collapsing",
         {'strip_comments': 1, 'skip_blanks': 1,
          'join_lines': 1, 'rstrip_ws': 1},
         result5),
        (6, "join lines with collapsing",
         {'strip_comments': 1, 'skip_blanks': 1,
          'join_lines': 1, 'rstrip_ws': 1, 'collapse_join': 1},
         result6),
    ]

    filename = "test.txt"
    out_file = open(filename, "w")
    try:
        out_file.write(test_data)
    finally:
        out_file.close()

    try:
        for count, description, options, expected in test_cases:
            in_file = TextFile(filename, **options)
            try:
                test_input(count, description, in_file, expected)
            finally:
                # don't leak one open file handle per test case
                in_file.close()
    finally:
        # remove the scratch file even if a test blew up
        os.remove(filename)
359