blob: 40b8484a685d153a7c20f04fc3bcb6a1adb2b3f3 [file] [log] [blame]
"""text_file

Provides the TextFile class, which gives an interface to text files
that (optionally) takes care of stripping comments, ignoring blank
lines, and joining lines with backslashes."""
6
Tarek Ziadé36797272010-07-22 12:50:05 +00007import sys, os, io
8
Greg Wardd1dc4751999-01-13 16:12:04 +00009
class TextFile:
    """Provides a file-like object that takes care of all the things you
    commonly want to do when processing a text file that has some
    line-by-line syntax: strip comments (as long as "#" is your
    comment character), skip blank lines, join adjacent lines by
    escaping the newline (ie. backslash at end of line), strip
    leading and/or trailing whitespace.  All of these are optional
    and independently controllable.

    Provides a 'warn()' method so you can generate warning messages that
    report physical line number, even if the logical line in question
    spans multiple physical lines.  Also provides 'unreadline()' for
    implementing line-at-a-time lookahead.

    Constructor is called as:

        TextFile (filename=None, file=None, **options)

    It bombs (RuntimeError) if both 'filename' and 'file' are None;
    'filename' should be a string, and 'file' a file object (or
    something that provides 'readline()' and 'close()' methods).  It is
    recommended that you supply at least 'filename', so that TextFile
    can include it in warning messages.  If 'file' is not supplied,
    TextFile creates its own using 'io.open()'.

    The options are all boolean (except 'errors'), and affect the value
    returned by 'readline()':
      strip_comments [default: true]
        strip from "#" to end-of-line, as well as any whitespace
        leading up to the "#" -- unless it is escaped by a backslash
      lstrip_ws [default: false]
        strip leading whitespace from each line before returning it
      rstrip_ws [default: true]
        strip trailing whitespace (including line terminator!) from
        each line before returning it
      skip_blanks [default: true]
        skip lines that are empty *after* stripping comments and
        whitespace.  (If both lstrip_ws and rstrip_ws are false,
        then some lines may consist of solely whitespace: these will
        *not* be skipped, even if 'skip_blanks' is true.)
      join_lines [default: false]
        if a backslash is the last non-newline character on a line
        after stripping comments and whitespace, join the following line
        to it to form one "logical line"; if N consecutive lines end
        with a backslash, then N+1 physical lines will be joined to
        form one logical line.
      collapse_join [default: false]
        strip leading whitespace from lines that are joined to their
        predecessor; only matters if (join_lines and not lstrip_ws)
      errors [default: 'strict']
        error handler used to decode the file content

    Note that since 'rstrip_ws' can strip the trailing newline, the
    semantics of 'readline()' must differ from those of the builtin file
    object's 'readline()' method!  In particular, 'readline()' returns
    None for end-of-file: an empty string might just be a blank line (or
    an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
    not."""

    # Fallback value for every recognised option; the keys double as the
    # whitelist of keyword arguments accepted by the constructor.
    default_options = {
        'strip_comments': 1,
        'skip_blanks': 1,
        'lstrip_ws': 0,
        'rstrip_ws': 1,
        'join_lines': 0,
        'collapse_join': 0,
        'errors': 'strict',
    }

    def __init__(self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
        (a string) and 'file' (a file-like object) must be supplied.
        The keyword argument options are described above and affect
        the values returned by 'readline()'.

        Raises RuntimeError if neither 'filename' nor 'file' is given,
        and KeyError if an unknown option name is passed."""
        if filename is None and file is None:
            raise RuntimeError("you must supply either or both of 'filename' and 'file'")

        # sanity check client option hash before any state is set
        for opt in options:
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        # set values for all options -- either from client option hash
        # or fallback to default_options
        for opt, default in self.default_options.items():
            setattr(self, opt, options.get(opt, default))

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []

        if file is None:
            self.open(filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

    def open(self, filename):
        """Open a new file named 'filename'.  This overrides both the
        'filename' and 'file' arguments to the constructor.  The file is
        opened for reading in text mode with the 'errors' option as the
        decoding error handler, and the line counter is reset to 0."""
        self.filename = filename
        self.file = io.open(self.filename, 'r', errors=self.errors)
        self.current_line = 0

    def close(self):
        """Close the current file and forget everything we know about it
        (filename, current line number).  Calling 'close()' again on an
        already-closed TextFile is a no-op."""
        if self.file is not None:
            self.file.close()
        self.file = None
        self.filename = None
        self.current_line = None

    def gen_error(self, msg, line=None):
        """Return 'msg' prefixed with the file name and line number(s).

        'line' defaults to the current line; it may be an integer for a
        single physical line, or a list/tuple of two integers for a
        range ("lines M-N").  The file name part is left out when no
        'filename' is known (ie. only 'file' was given to the
        constructor)."""
        if line is None:
            line = self.current_line

        outmsg = []
        if self.filename is not None:
            outmsg.append("%s, " % self.filename)
        if isinstance(line, (list, tuple)):
            outmsg.append("lines %d-%d: " % tuple(line))
        else:
            outmsg.append("line %d: " % line)
        outmsg.append(str(msg))
        return "".join(outmsg)

    def error(self, msg, line=None):
        """Raise ValueError with 'msg' tied to the current (or given)
        line; see 'gen_error()' for the meaning of 'line'."""
        raise ValueError("error: " + self.gen_error(msg, line))

    def warn(self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
        line in the current file.  If the current logical line in the
        file spans multiple physical lines, the warning refers to the
        whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
        the current line number; it may be a list or tuple to indicate a
        range of physical lines, or an integer for a single physical
        line."""
        sys.stderr.write("warning: " + self.gen_error(msg, line) + "\n")

    def _advance_line_number(self, continuing):
        """Account for one more physical line having been consumed.

        'current_line' is an int for a single-line logical line and a
        two-element list [first, last] while a logical line spans
        several physical lines.  If 'continuing' is true the physical
        line belongs to the logical line being built up, so the range
        is extended; otherwise a new logical line starts."""
        current = self.current_line
        if continuing:
            if isinstance(current, list):
                current[1] += 1
            else:
                self.current_line = [current, current + 1]
        elif isinstance(current, list):
            self.current_line = current[1] + 1
        else:
            self.current_line = current + 1

    def readline(self):
        """Read and return a single logical line from the current file (or
        from an internal buffer if lines have previously been "unread"
        with 'unreadline()').  If the 'join_lines' option is true, this
        may involve reading multiple physical lines concatenated into a
        single string.  Updates the current line number, so calling
        'warn()' after 'readline()' emits a warning about the physical
        line(s) just read.  Returns None on end-of-file, since the empty
        string can occur if 'rstrip_ws' is true but 'skip_blanks' is
        not."""
        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop()

        buildup_line = ''

        while True:
            # read the line, make it None if EOF
            line = self.file.readline()
            if line == '':
                line = None

            # Did the previous physical line end with a backslash?  Note
            # that an empty 'buildup_line' does not count, so a line that
            # consists of nothing but a backslash starts afresh.
            continuing = bool(self.join_lines and buildup_line)

            if self.strip_comments and line:
                # Look for the first "#" in the line.  If none, never
                # mind.  If we find one and it's the first character, or
                # is not preceded by "\", then it starts a comment --
                # strip the comment, strip whitespace before it, and
                # carry on.  Otherwise, it's just an escaped "#", so
                # unescape it (and any other escaped "#"'s that might be
                # lurking in there) and otherwise leave the line alone.
                # NOTE: only the first "#" is inspected, so an unescaped
                # "#" that follows an escaped one is not treated as the
                # start of a comment.
                pos = line.find("#")
                if pos == -1:
                    pass                # no "#" -- no comments
                elif pos == 0 or line[pos - 1] != "\\":
                    # Have to preserve the trailing newline, because it's
                    # the job of a later step (rstrip_ws) to remove it --
                    # and if rstrip_ws is false, we'd better preserve it!
                    eol = '\n' if line.endswith('\n') else ''
                    line = line[:pos] + eol

                    # If all that's left is whitespace, then skip line
                    # *now*, before we try to join it to 'buildup_line' --
                    # that way constructs like
                    #   hello \\
                    #   # comment that should be ignored
                    #   there
                    # result in "hello there".  The skipped line is still
                    # a physical line, so it must be counted or every
                    # later line number would be reported too low.
                    if line.strip() == "":
                        self._advance_line_number(continuing)
                        continue
                else:
                    # it's an escaped "#"
                    line = line.replace("\\#", "#")

            if line is None:
                # end of file
                if continuing:
                    self.warn("continuation line immediately precedes "
                              "end-of-file")
                    return buildup_line
                return None

            if continuing:
                # accumulate onto what we have so far
                if self.collapse_join:
                    line = line.lstrip()
                line = buildup_line + line

            # careful: the line number is a range while joining
            self._advance_line_number(continuing)

            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip()
            elif self.lstrip_ws:
                line = line.lstrip()
            elif self.rstrip_ws:
                line = line.rstrip()

            # blank line (whether we rstrip'ed or not)?  skip to next line
            # if appropriate
            if self.skip_blanks and line in ('', '\n'):
                continue

            if self.join_lines:
                # 'endswith' rather than indexing: 'line' may be the empty
                # string here when 'skip_blanks' is false.
                if line.endswith('\\'):
                    buildup_line = line[:-1]
                    continue

                if line.endswith('\\\n'):
                    buildup_line = line[:-2] + '\n'
                    continue

            # well, I guess there's some actual content there: return it
            return line

    def readlines(self):
        """Read and return the list of all logical lines remaining in the
        current file."""
        # 'readline()' signals end-of-file with None, never with ''
        return list(iter(self.readline, None))

    def unreadline(self, line):
        """Push 'line' (a string) onto an internal buffer that will be
        checked by future 'readline()' calls.  Handy for implementing
        a parser with line-at-a-time lookahead."""
        self.linebuf.append(line)