blob: 454725c626da9f049a3c574cc1929acd7454e1c9 [file] [log] [blame]
Greg Wardd1dc4751999-01-13 16:12:04 +00001"""text_file
2
3provides the TextFile class, which gives an interface to text files
4that (optionally) takes care of stripping comments, ignoring blank
5lines, and joining lines with backslashes."""
6
Greg Wardd1dc4751999-01-13 16:12:04 +00007__revision__ = "$Id$"
8
Tarek Ziadé36797272010-07-22 12:50:05 +00009import sys, os, io
10
Greg Wardd1dc4751999-01-13 16:12:04 +000011
class TextFile:
    """Provides a file-like object that takes care of all the things you
    commonly want to do when processing a text file that has some
    line-by-line syntax: strip comments (as long as "#" is your
    comment character), skip blank lines, join adjacent lines by
    escaping the newline (ie. backslash at end of line), strip
    leading and/or trailing whitespace.  All of these are optional
    and independently controllable.

    Provides a 'warn()' method so you can generate warning messages that
    report physical line number, even if the logical line in question
    spans multiple physical lines.  Also provides 'unreadline()' for
    implementing line-at-a-time lookahead.

    Constructor is called as:

        TextFile (filename=None, file=None, **options)

    It bombs (RuntimeError) if both 'filename' and 'file' are None;
    'filename' should be a string, and 'file' a file object (or
    something that provides 'readline()' and 'close()' methods).  It is
    recommended that you supply at least 'filename', so that TextFile
    can include it in warning messages.  If 'file' is not supplied,
    TextFile creates its own using 'io.open()'.

    The options are all boolean (except 'errors'), and affect the value
    returned by 'readline()':
      strip_comments [default: true]
        strip from "#" to end-of-line -- unless the "#" is escaped by a
        backslash.  Only the first "#" on a line is examined: if it is
        escaped, every escaped "#" on the line is unescaped and nothing
        is stripped from that line.
      lstrip_ws [default: false]
        strip leading whitespace from each line before returning it
      rstrip_ws [default: true]
        strip trailing whitespace (including line terminator!) from
        each line before returning it
      skip_blanks [default: true]
        skip lines that are empty *after* stripping comments and
        whitespace.  (If both lstrip_ws and rstrip_ws are false,
        then some lines may consist of solely whitespace: these will
        *not* be skipped, even if 'skip_blanks' is true.)
      join_lines [default: false]
        if a backslash is the last non-newline character on a line
        after stripping comments and whitespace, join the following line
        to it to form one "logical line"; if N consecutive lines end
        with a backslash, then N+1 physical lines will be joined to
        form one logical line.
      collapse_join [default: false]
        strip leading whitespace from lines that are joined to their
        predecessor; only matters if (join_lines and not lstrip_ws)
      errors [default: 'strict']
        error handler used to decode the file content

    Note that since 'rstrip_ws' can strip the trailing newline, the
    semantics of 'readline()' must differ from those of the builtin file
    object's 'readline()' method!  In particular, 'readline()' returns
    None for end-of-file: an empty string might just be a blank line (or
    an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
    not."""

    default_options = {
        'strip_comments': 1,
        'skip_blanks': 1,
        'lstrip_ws': 0,
        'rstrip_ws': 1,
        'join_lines': 0,
        'collapse_join': 0,
        'errors': 'strict',
    }

    def __init__(self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
        (a string) and 'file' (a file-like object) must be supplied.
        The keyword argument options are described above and affect
        the values returned by 'readline()'.

        Raises RuntimeError if neither 'filename' nor 'file' is given and
        KeyError if an unknown option name is passed."""
        if filename is None and file is None:
            raise RuntimeError("you must supply either or both of 'filename' and 'file'")

        # sanity check client options before touching any state
        for opt in options:
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        # every option becomes an instance attribute -- either the
        # client's value or the fallback from default_options
        for opt, default in self.default_options.items():
            setattr(self, opt, options.get(opt, default))

        if file is None:
            self.open(filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []

    def open(self, filename):
        """Open a new file named 'filename'.  This overrides both the
        'filename' and 'file' arguments to the constructor."""
        self.filename = filename
        self.file = io.open(self.filename, 'r', errors=self.errors)
        self.current_line = 0

    def close(self):
        """Close the current file and forget everything we know about it
        (filename, current line number).  Calling it again on an already
        closed TextFile is a no-op."""
        file = self.file
        self.file = None
        self.filename = None
        self.current_line = None
        if file is not None:
            file.close()

    def gen_error(self, msg, line=None):
        """Return 'msg' prefixed with the file name (when one is known)
        and a line number.  'line' defaults to the current line; it may
        be an integer, or a list/tuple of two integers denoting a range
        of physical lines."""
        outmsg = []
        if line is None:
            line = self.current_line
        # 'filename' is legitimately None when only 'file' was supplied
        if self.filename is not None:
            outmsg.append("%s, " % (self.filename,))
        if isinstance(line, (list, tuple)):
            outmsg.append("lines %d-%d: " % tuple(line))
        else:
            outmsg.append("line %d: " % line)
        outmsg.append(str(msg))
        return "".join(outmsg)

    def error(self, msg, line=None):
        """Raise ValueError carrying 'msg' tagged with file name and line
        number (see 'gen_error()')."""
        raise ValueError("error: " + self.gen_error(msg, line))

    def warn(self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
        line in the current file.  If the current logical line in the
        file spans multiple physical lines, the warning refers to the
        whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
        the current line number; it may be a list or tuple to indicate a
        range of physical lines, or an integer for a single physical
        line."""
        sys.stderr.write("warning: " + self.gen_error(msg, line) + "\n")

    def _advance_line(self, continuing):
        """Record that one more physical line has been consumed.

        If 'continuing' is true the physical line belongs to the logical
        line currently being joined, so 'current_line' becomes (or
        extends) a [first, last] list; otherwise the physical line starts
        a fresh logical line and 'current_line' becomes a plain integer."""
        current = self.current_line
        if continuing:
            if isinstance(current, list):
                current[1] += 1
            else:
                self.current_line = [current, current + 1]
        elif isinstance(current, list):
            self.current_line = current[1] + 1
        else:
            self.current_line = current + 1

    def readline(self):
        """Read and return a single logical line from the current file (or
        from an internal buffer if lines have previously been "unread"
        with 'unreadline()').  If the 'join_lines' option is true, this
        may involve reading multiple physical lines concatenated into a
        single string.  Updates the current line number, so calling
        'warn()' after 'readline()' emits a warning about the physical
        line(s) just read.  Returns None on end-of-file, since the empty
        string can occur if 'rstrip_ws' is true but 'skip_blanks' is
        not."""
        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop()

        buildup_line = ''

        while True:
            # read the line, make it None if EOF
            line = self.file.readline()
            if line == '':
                line = None

            # did the previous physical line end with a backslash?
            continuing = bool(self.join_lines and buildup_line)

            if self.strip_comments and line:
                # Look for the first "#" in the line.  If none, never
                # mind.  If we find one and it's the first character, or
                # is not preceded by "\", then it starts a comment --
                # strip the comment and carry on.  Otherwise, it's just
                # an escaped "#", so unescape it (and any other escaped
                # "#"'s that might be lurking in there) and otherwise
                # leave the line alone.
                pos = line.find("#")
                if pos == -1:
                    pass                # no "#" -- no comments
                elif pos == 0 or line[pos - 1] != "\\":
                    # Have to preserve the trailing newline, because it's
                    # the job of a later step (rstrip_ws) to remove it --
                    # and if rstrip_ws is false, we'd better preserve it!
                    eol = '\n' if line[-1] == '\n' else ''
                    line = line[:pos] + eol

                    # If all that's left is whitespace, then skip line
                    # *now*, before we try to join it to 'buildup_line' --
                    # that way constructs like
                    #   hello \\
                    #   # comment that should be ignored
                    #   there
                    # result in "hello there".  The skipped line is still
                    # a physical line, so count it to keep the line
                    # numbers reported by warn()/error() accurate.
                    if line.strip() == "":
                        self._advance_line(continuing)
                        continue
                else:                   # it's an escaped "#"
                    line = line.replace("\\#", "#")

            if continuing:
                # oops: end of file in the middle of a continuation
                if line is None:
                    self.warn("continuation line immediately precedes "
                              "end-of-file")
                    return buildup_line

                if self.collapse_join:
                    line = line.lstrip()
                line = buildup_line + line
            elif line is None:          # eof
                return None

            self._advance_line(continuing)

            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip()
            elif self.lstrip_ws:
                line = line.lstrip()
            elif self.rstrip_ws:
                line = line.rstrip()

            # blank line (whether we rstrip'ed or not)?  skip to next line
            # if appropriate
            if line in ('', '\n') and self.skip_blanks:
                continue

            if self.join_lines:
                # endswith() rather than indexing: 'line' may be empty
                # here when 'skip_blanks' is false
                if line.endswith('\\'):
                    buildup_line = line[:-1]
                    continue

                if line.endswith('\\\n'):
                    buildup_line = line[:-2] + '\n'
                    continue

            # well, I guess there's some actual content there: return it
            return line

    def readlines(self):
        """Read and return the list of all logical lines remaining in the
        current file."""
        # readline() signals end-of-file with None, not with ''
        return list(iter(self.readline, None))

    def unreadline(self, line):
        """Push 'line' (a string) onto an internal buffer that will be
        checked by future 'readline()' calls.  Handy for implementing
        a parser with line-at-a-time lookahead."""
        self.linebuf.append(line)