blob: 7b29ef4aa5c48fecd8a6f9263b194da55fe6629f [file] [log] [blame]
"""text_file

provides the TextFile class, which gives an interface to text files
that (optionally) takes care of stripping comments, ignoring blank
lines, and joining lines with backslashes."""
6
7# created 1999/01/12, Greg Ward
8
9__revision__ = "$Id$"
10
11from types import *
Greg Wardf6cdcd51999-01-18 17:08:16 +000012import sys, os, string, re
Greg Wardd1dc4751999-01-13 16:12:04 +000013
14
class TextFile:

    """Provides a file-like object that takes care of all the things you
    commonly want to do when processing a text file that has some
    line-by-line syntax: strip comments (as long as "#" is your comment
    character), skip blank lines, join adjacent lines by escaping the
    newline (ie. backslash at end of line), strip leading and/or
    trailing whitespace, and collapse internal whitespace.  All of these
    are optional and independently controllable.

    Provides a 'warn()' method so you can generate warning messages that
    report physical line number, even if the logical line in question
    spans multiple physical lines.  Also provides 'unreadline()' for
    implementing line-at-a-time lookahead.

    Constructor is called as:

        TextFile (filename=None, file=None, **options)

    It bombs (RuntimeError) if both 'filename' and 'file' are None;
    'filename' should be a string, and 'file' a file object (or
    something that provides 'readline()' and 'close()' methods).  It is
    recommended that you supply at least 'filename', so that TextFile
    can include it in warning messages.  If 'file' is not supplied,
    TextFile creates its own using the 'open()' builtin.

    The options are all boolean, and affect the value returned by
    'readline()':
      strip_comments [default: true]
        strip from "#" to end-of-line -- unless the "#" is escaped by
        a backslash.  Whitespace leading up to the "#" is left in
        place by this step; it only disappears if 'rstrip_ws' is true.
      lstrip_ws [default: false]
        strip leading whitespace from each line before returning it
      rstrip_ws [default: true]
        strip trailing whitespace (including line terminator!) from
        each line before returning it
      skip_blanks [default: true]
        skip lines that are empty *after* stripping comments and
        whitespace.  (If both lstrip_ws and rstrip_ws are false,
        then some lines may consist of solely whitespace: these will
        *not* be skipped, even if 'skip_blanks' is true.)
      join_lines [default: false]
        if a backslash is the last non-newline character on a line
        after stripping comments and whitespace, join the following line
        to it to form one "logical line"; if N consecutive lines end
        with a backslash, then N+1 physical lines will be joined to
        form one logical line.
      collapse_ws [default: false]
        after stripping comments and whitespace and joining physical
        lines into logical lines, all internal whitespace (strings of
        whitespace surrounded by non-whitespace characters, and not at
        the beginning or end of the logical line) will be collapsed
        to a single space.

    Note that since 'rstrip_ws' can strip the trailing newline, the
    semantics of 'readline()' must differ from those of the builtin file
    object's 'readline()' method!  In particular, 'readline()' returns
    None for end-of-file: an empty string might just be a blank line (or
    an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
    not."""

    default_options = { 'strip_comments': 1,
                        'skip_blanks':    1,
                        'join_lines':     0,
                        'lstrip_ws':      0,
                        'rstrip_ws':      1,
                        'collapse_ws':    0,
                      }

    def __init__(self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
        (a string) and 'file' (a file-like object) must be supplied.
        The keyword argument options are described above and affect
        the values returned by 'readline()'.

        Raises RuntimeError if neither 'filename' nor 'file' is given,
        and KeyError if an unknown option name is passed."""

        if filename is None and file is None:
            raise RuntimeError(
                "you must supply either or both of 'filename' and 'file'")

        # sanity check client option hash
        for opt in options.keys():
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        # set values for all options -- either from client option hash
        # or fallback to default_options
        for opt in self.default_options.keys():
            if opt in options:
                setattr(self, opt, options[opt])
            else:
                setattr(self, opt, self.default_options[opt])

        if file is None:
            self.open(filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []


    def open(self, filename):
        """Open a new file named 'filename'.  This overrides both the
        'filename' and 'file' arguments to the constructor."""

        self.filename = filename
        # NB. inside a method body the bare name 'open' is still the
        # builtin, not this method
        self.file = open(self.filename, 'r')
        self.current_line = 0


    def close(self):
        """Close the current file and forget everything we know about it
        (filename, current line number)."""

        self.file.close()
        self.file = None
        self.filename = None
        self.current_line = None


    def warn(self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
        line in the current file.  If the current logical line in the
        file spans multiple physical lines, the warning refers to the
        whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
        the current line number; it may be a list or tuple to indicate a
        range of physical lines, or an integer for a single physical
        line.  If no filename is known (the object was built from a
        'file' only), the filename prefix is simply omitted."""

        if line is None:
            line = self.current_line

        pieces = []
        if self.filename is not None:
            pieces.append(self.filename + ", ")
        if isinstance(line, (list, tuple)):
            pieces.append("lines %d-%d: " % tuple(line))
        else:
            pieces.append("line %d: " % line)
        pieces.append(str(msg) + "\n")
        sys.stderr.write("".join(pieces))


    def readline(self):
        """Read and return a single logical line from the current file (or
        from an internal buffer if lines have previously been "unread"
        with 'unreadline()').  If the 'join_lines' option is true, this
        may involve reading multiple physical lines concatenated into a
        single string.  Updates the current line number, so calling
        'warn()' after 'readline()' emits a warning about the physical
        line(s) just read.  Returns None on end-of-file, since the empty
        string can occur if 'rstrip_ws' is true but 'skip_blanks' is
        not."""

        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop()

        buildup_line = ''

        while 1:
            # read the line, make it None if EOF
            line = self.file.readline()
            if line == '':
                line = None

            if self.strip_comments and line:

                # Look for the first "#" in the line.  If none, never
                # mind.  If we find one and it's the first character, or
                # is not preceded by "\", then it starts a comment --
                # strip the comment and carry on.  Otherwise, it's just
                # an escaped "#", so unescape it (and any other escaped
                # "#"'s that might be lurking in there) and otherwise
                # leave the line alone.  (NB. only the first "#" is
                # examined, so a real comment that follows an escaped
                # "#" on the same line is not stripped.)

                pos = line.find("#")
                if pos == -1:           # no "#" -- no comments
                    pass
                elif pos == 0 or line[pos-1] != "\\":   # it's a comment

                    # Have to preserve the trailing newline, because it's
                    # the job of a later step (rstrip_ws) to remove it --
                    # and if rstrip_ws is false, we'd better preserve it!
                    # (NB. this means that if the final line is all comment
                    # and has no trailing newline, we will think that it's
                    # EOF; I think that's OK.)
                    if line[-1] == '\n':
                        eol = '\n'
                    else:
                        eol = ''
                    line = line[0:pos] + eol

                else:                   # it's an escaped "#"
                    line = line.replace("\\#", "#")

            # did previous line end with a backslash? then accumulate
            if self.join_lines and buildup_line:
                # oops: end of file
                if line is None:
                    self.warn("continuation line immediately precedes "
                              "end-of-file")
                    return buildup_line

                line = buildup_line + line

                # careful: pay attention to line number when incrementing it
                if isinstance(self.current_line, list):
                    self.current_line[1] = self.current_line[1] + 1
                else:
                    self.current_line = [self.current_line,
                                         self.current_line + 1]
            # just an ordinary line, read it as usual
            else:
                if line is None:        # eof
                    return None

                # still have to be careful about incrementing the line number!
                if isinstance(self.current_line, list):
                    self.current_line = self.current_line[1] + 1
                else:
                    self.current_line = self.current_line + 1

            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip()
            elif self.lstrip_ws:
                line = line.lstrip()
            elif self.rstrip_ws:
                line = line.rstrip()

            # blank line (whether we rstrip'ed or not)? skip to next line
            # if appropriate.  The parentheses matter: without them 'and'
            # binds tighter than 'or' and empty lines would be dropped
            # even when 'skip_blanks' is false.
            if (line == '' or line == '\n') and self.skip_blanks:
                continue

            if self.join_lines:
                # slice rather than index: 'line' may legitimately be ''
                # here when 'skip_blanks' is false
                if line[-1:] == '\\':
                    buildup_line = line[:-1]
                    continue

                if line[-2:] == '\\\n':
                    buildup_line = line[0:-2] + '\n'
                    continue

            # collapse internal whitespace (*after* joining lines!).
            # Lookarounds leave the neighbouring non-space characters
            # unconsumed, so back-to-back runs such as "a   b   c" are
            # all collapsed.
            if self.collapse_ws:
                line = re.sub(r'(?<=\S)\s+(?=\S)', ' ', line)

            # well, I guess there's some actual content there: return it
            return line

    # end readline


    def readlines(self):
        """Read and return the list of all logical lines remaining in the
        current file."""

        lines = []
        while 1:
            line = self.readline()
            if line is None:
                return lines
            lines.append(line)


    def unreadline(self, line):
        """Push 'line' (a string) onto an internal buffer that will be
        checked by future 'readline()' calls.  Handy for implementing
        a parser with line-at-a-time lookahead."""

        self.linebuf.append(line)
292
293
if __name__ == "__main__":
    # Self-test: write a small scratch file, read it back through
    # TextFile with several option combinations, and report "ok" or
    # "not ok" for each one on stdout.
    test_data = """# test file

line 3 \\
continues on next line
"""

    # result 1: no fancy options
    result1 = [x + "\n" for x in test_data.split("\n")[0:-1]]

    # result 2: just strip comments
    result2 = ["\n", "\n", "line 3 \\\n", "continues on next line\n"]

    # result 3: just strip blank lines
    result3 = ["# test file\n", "line 3 \\\n", "continues on next line\n"]

    # result 4: default, strip comments, blank lines, and trailing whitespace
    result4 = ["line 3 \\", "continues on next line"]

    # result 5: full processing, strip comments and blanks, plus join lines
    result5 = ["line 3 continues on next line"]

    def test_input(count, description, file, expected_result):
        # Read every logical line from 'file', always close it (so the
        # scratch file can be removed afterwards), then compare against
        # 'expected_result' and print the verdict.
        try:
            result = file.readlines()
        finally:
            file.close()
        if result == expected_result:
            print("ok %d (%s)" % (count, description))
        else:
            print("not ok %d (%s):" % (count, description))
            print("** expected:")
            print(expected_result)
            print("** received:")
            print(result)


    filename = "test.txt"
    out_file = open(filename, "w")
    try:
        out_file.write(test_data)
    finally:
        out_file.close()

    # remove the scratch file even if one of the checks blows up
    try:
        in_file = TextFile(filename, strip_comments=0, skip_blanks=0,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(1, "no processing", in_file, result1)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=0,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(2, "strip comments", in_file, result2)

        in_file = TextFile(filename, strip_comments=0, skip_blanks=1,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(3, "strip blanks", in_file, result3)

        in_file = TextFile(filename)
        test_input(4, "default processing", in_file, result4)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=1,
                           join_lines=1, rstrip_ws=1)
        test_input(5, "full processing", in_file, result5)
    finally:
        os.remove(filename)
355