Daniel Dunbar | be7ada7 | 2009-09-08 05:31:18 +0000 | [diff] [blame] | 1 | import itertools |
| 2 | |
| 3 | import Util |
| 4 | from ShCommands import Command, Pipeline, Seq |
| 5 | |
| 6 | class ShLexer: |
| 7 | def __init__(self, data, win32Escapes = False): |
| 8 | self.data = data |
| 9 | self.pos = 0 |
| 10 | self.end = len(data) |
| 11 | self.win32Escapes = win32Escapes |
| 12 | |
| 13 | def eat(self): |
| 14 | c = self.data[self.pos] |
| 15 | self.pos += 1 |
| 16 | return c |
| 17 | |
| 18 | def look(self): |
| 19 | return self.data[self.pos] |
| 20 | |
| 21 | def maybe_eat(self, c): |
| 22 | """ |
| 23 | maybe_eat(c) - Consume the character c if it is the next character, |
| 24 | returning True if a character was consumed. """ |
| 25 | if self.data[self.pos] == c: |
| 26 | self.pos += 1 |
| 27 | return True |
| 28 | return False |
| 29 | |
| 30 | def lex_arg_fast(self, c): |
| 31 | # Get the leading whitespace free section. |
| 32 | chunk = self.data[self.pos - 1:].split(None, 1)[0] |
| 33 | |
| 34 | # If it has special characters, the fast path failed. |
| 35 | if ('|' in chunk or '&' in chunk or |
| 36 | '<' in chunk or '>' in chunk or |
| 37 | "'" in chunk or '"' in chunk or |
| 38 | '\\' in chunk): |
| 39 | return None |
| 40 | |
| 41 | self.pos = self.pos - 1 + len(chunk) |
| 42 | return chunk |
| 43 | |
| 44 | def lex_arg_slow(self, c): |
| 45 | if c in "'\"": |
| 46 | str = self.lex_arg_quoted(c) |
| 47 | else: |
| 48 | str = c |
| 49 | while self.pos != self.end: |
| 50 | c = self.look() |
| 51 | if c.isspace() or c in "|&": |
| 52 | break |
| 53 | elif c in '><': |
| 54 | # This is an annoying case; we treat '2>' as a single token so |
| 55 | # we don't have to track whitespace tokens. |
| 56 | |
| 57 | # If the parse string isn't an integer, do the usual thing. |
| 58 | if not str.isdigit(): |
| 59 | break |
| 60 | |
| 61 | # Otherwise, lex the operator and convert to a redirection |
| 62 | # token. |
| 63 | num = int(str) |
| 64 | tok = self.lex_one_token() |
| 65 | assert isinstance(tok, tuple) and len(tok) == 1 |
| 66 | return (tok[0], num) |
| 67 | elif c == '"': |
| 68 | self.eat() |
| 69 | str += self.lex_arg_quoted('"')
|
| 70 | elif not self.win32Escapes and c == '\\': |
| 71 | # Outside of a string, '\\' escapes everything. |
| 72 | self.eat() |
| 73 | if self.pos == self.end: |
| 74 | Util.warning("escape at end of quoted argument in: %r" % |
| 75 | self.data) |
| 76 | return str |
| 77 | str += self.eat() |
| 78 | else: |
| 79 | str += self.eat() |
| 80 | return str |
| 81 | |
| 82 | def lex_arg_quoted(self, delim): |
| 83 | str = '' |
| 84 | while self.pos != self.end: |
| 85 | c = self.eat() |
| 86 | if c == delim: |
| 87 | return str |
| 88 | elif c == '\\' and delim == '"': |
| 89 | # Inside a '"' quoted string, '\\' only escapes the quote |
| 90 | # character and backslash, otherwise it is preserved. |
| 91 | if self.pos == self.end: |
| 92 | Util.warning("escape at end of quoted argument in: %r" % |
| 93 | self.data) |
| 94 | return str |
| 95 | c = self.eat() |
| 96 | if c == '"': # |
| 97 | str += '"' |
| 98 | elif c == '\\': |
| 99 | str += '\\' |
| 100 | else: |
| 101 | str += '\\' + c |
| 102 | else: |
| 103 | str += c |
| 104 | Util.warning("missing quote character in %r" % self.data) |
| 105 | return str |
| 106 | |
| 107 | def lex_arg_checked(self, c): |
| 108 | pos = self.pos |
| 109 | res = self.lex_arg_fast(c) |
| 110 | end = self.pos |
| 111 | |
| 112 | self.pos = pos |
| 113 | reference = self.lex_arg_slow(c) |
| 114 | if res is not None: |
| 115 | if res != reference: |
| 116 | raise ValueError,"Fast path failure: %r != %r" % (res, reference) |
| 117 | if self.pos != end: |
| 118 | raise ValueError,"Fast path failure: %r != %r" % (self.pos, end) |
| 119 | return reference |
| 120 | |
| 121 | def lex_arg(self, c): |
| 122 | return self.lex_arg_fast(c) or self.lex_arg_slow(c) |
| 123 | |
| 124 | def lex_one_token(self): |
| 125 | """ |
| 126 | lex_one_token - Lex a single 'sh' token. """ |
| 127 | |
| 128 | c = self.eat() |
| 129 | if c in ';!': |
| 130 | return (c,) |
| 131 | if c == '|': |
| 132 | if self.maybe_eat('|'): |
| 133 | return ('||',) |
| 134 | return (c,) |
| 135 | if c == '&': |
| 136 | if self.maybe_eat('&'): |
| 137 | return ('&&',) |
| 138 | if self.maybe_eat('>'): |
| 139 | return ('&>',) |
| 140 | return (c,) |
| 141 | if c == '>': |
| 142 | if self.maybe_eat('&'): |
| 143 | return ('>&',) |
| 144 | if self.maybe_eat('>'): |
| 145 | return ('>>',) |
| 146 | return (c,) |
| 147 | if c == '<': |
| 148 | if self.maybe_eat('&'): |
| 149 | return ('<&',) |
| 150 | if self.maybe_eat('>'): |
| 151 | return ('<<',) |
| 152 | return (c,) |
| 153 | |
| 154 | return self.lex_arg(c) |
| 155 | |
| 156 | def lex(self): |
| 157 | while self.pos != self.end: |
| 158 | if self.look().isspace(): |
| 159 | self.eat() |
| 160 | else: |
| 161 | yield self.lex_one_token() |
| 162 | |
| 163 | ### |
| 164 | |
| 165 | class ShParser: |
| 166 | def __init__(self, data, win32Escapes = False): |
| 167 | self.data = data |
| 168 | self.tokens = ShLexer(data, win32Escapes = win32Escapes).lex() |
| 169 | |
| 170 | def lex(self): |
| 171 | try: |
| 172 | return self.tokens.next() |
| 173 | except StopIteration: |
| 174 | return None |
| 175 | |
| 176 | def look(self): |
| 177 | next = self.lex() |
| 178 | if next is not None: |
| 179 | self.tokens = itertools.chain([next], self.tokens) |
| 180 | return next |
| 181 | |
| 182 | def parse_command(self): |
| 183 | tok = self.lex() |
| 184 | if not tok: |
| 185 | raise ValueError,"empty command!" |
| 186 | if isinstance(tok, tuple): |
| 187 | raise ValueError,"syntax error near unexpected token %r" % tok[0] |
| 188 | |
| 189 | args = [tok] |
| 190 | redirects = [] |
| 191 | while 1: |
| 192 | tok = self.look() |
| 193 | |
| 194 | # EOF? |
| 195 | if tok is None: |
| 196 | break |
| 197 | |
| 198 | # If this is an argument, just add it to the current command. |
| 199 | if isinstance(tok, str): |
| 200 | args.append(self.lex()) |
| 201 | continue |
| 202 | |
| 203 | # Otherwise see if it is a terminator. |
| 204 | assert isinstance(tok, tuple) |
| 205 | if tok[0] in ('|',';','&','||','&&'): |
| 206 | break |
| 207 | |
| 208 | # Otherwise it must be a redirection. |
| 209 | op = self.lex() |
| 210 | arg = self.lex() |
| 211 | if not arg: |
| 212 | raise ValueError,"syntax error near token %r" % op[0] |
| 213 | redirects.append((op, arg)) |
| 214 | |
| 215 | return Command(args, redirects) |
| 216 | |
| 217 | def parse_pipeline(self): |
| 218 | negate = False |
| 219 | if self.look() == ('!',): |
| 220 | self.lex() |
| 221 | negate = True |
| 222 | |
| 223 | commands = [self.parse_command()] |
| 224 | while self.look() == ('|',): |
| 225 | self.lex() |
| 226 | commands.append(self.parse_command()) |
| 227 | return Pipeline(commands, negate) |
| 228 | |
| 229 | def parse(self): |
| 230 | lhs = self.parse_pipeline() |
| 231 | |
| 232 | while self.look(): |
| 233 | operator = self.lex() |
| 234 | assert isinstance(operator, tuple) and len(operator) == 1 |
| 235 | |
| 236 | if not self.look(): |
| 237 | raise ValueError, "missing argument to operator %r" % operator[0] |
| 238 | |
| 239 | # FIXME: Operator precedence!! |
| 240 | lhs = Seq(lhs, operator[0], self.parse_pipeline()) |
| 241 | |
| 242 | return lhs |
| 243 | |
| 244 | ### |
| 245 | |
| 246 | import unittest |
| 247 | |
| 248 | class TestShLexer(unittest.TestCase): |
| 249 | def lex(self, str, *args, **kwargs): |
| 250 | return list(ShLexer(str, *args, **kwargs).lex()) |
| 251 | |
| 252 | def test_basic(self): |
| 253 | self.assertEqual(self.lex('a|b>c&d<e'), |
| 254 | ['a', ('|',), 'b', ('>',), 'c', ('&',), 'd', |
| 255 | ('<',), 'e']) |
| 256 | |
| 257 | def test_redirection_tokens(self): |
| 258 | self.assertEqual(self.lex('a2>c'), |
| 259 | ['a2', ('>',), 'c']) |
| 260 | self.assertEqual(self.lex('a 2>c'), |
| 261 | ['a', ('>',2), 'c']) |
| 262 | |
| 263 | def test_quoting(self): |
| 264 | self.assertEqual(self.lex(""" 'a' """), |
| 265 | ['a']) |
| 266 | self.assertEqual(self.lex(""" "hello\\"world" """), |
| 267 | ['hello"world']) |
| 268 | self.assertEqual(self.lex(""" "hello\\'world" """), |
| 269 | ["hello\\'world"]) |
| 270 | self.assertEqual(self.lex(""" "hello\\\\world" """), |
| 271 | ["hello\\world"]) |
| 272 | self.assertEqual(self.lex(""" he"llo wo"rld """), |
| 273 | ["hello world"]) |
| 274 | self.assertEqual(self.lex(""" a\\ b a\\\\b """), |
| 275 | ["a b", "a\\b"]) |
| 276 | self.assertEqual(self.lex(""" "" "" """), |
| 277 | ["", ""]) |
| 278 | self.assertEqual(self.lex(""" a\\ b """, win32Escapes = True), |
| 279 | ['a\\', 'b']) |
| 280 | |
| 281 | class TestShParse(unittest.TestCase): |
| 282 | def parse(self, str): |
| 283 | return ShParser(str).parse() |
| 284 | |
| 285 | def test_basic(self): |
| 286 | self.assertEqual(self.parse('echo hello'), |
| 287 | Pipeline([Command(['echo', 'hello'], [])], False)) |
| 288 | self.assertEqual(self.parse('echo ""'), |
| 289 | Pipeline([Command(['echo', ''], [])], False)) |
| 290 | |
| 291 | def test_redirection(self): |
| 292 | self.assertEqual(self.parse('echo hello > c'), |
| 293 | Pipeline([Command(['echo', 'hello'], |
| 294 | [((('>'),), 'c')])], False)) |
| 295 | self.assertEqual(self.parse('echo hello > c >> d'), |
| 296 | Pipeline([Command(['echo', 'hello'], [(('>',), 'c'), |
| 297 | (('>>',), 'd')])], False)) |
| 298 | self.assertEqual(self.parse('a 2>&1'), |
| 299 | Pipeline([Command(['a'], [(('>&',2), '1')])], False)) |
| 300 | |
| 301 | def test_pipeline(self): |
| 302 | self.assertEqual(self.parse('a | b'), |
| 303 | Pipeline([Command(['a'], []), |
| 304 | Command(['b'], [])], |
| 305 | False)) |
| 306 | |
| 307 | self.assertEqual(self.parse('a | b | c'), |
| 308 | Pipeline([Command(['a'], []), |
| 309 | Command(['b'], []), |
| 310 | Command(['c'], [])], |
| 311 | False)) |
| 312 | |
| 313 | self.assertEqual(self.parse('! a'), |
| 314 | Pipeline([Command(['a'], [])], |
| 315 | True)) |
| 316 | |
| 317 | def test_list(self): |
| 318 | self.assertEqual(self.parse('a ; b'), |
| 319 | Seq(Pipeline([Command(['a'], [])], False), |
| 320 | ';', |
| 321 | Pipeline([Command(['b'], [])], False))) |
| 322 | |
| 323 | self.assertEqual(self.parse('a & b'), |
| 324 | Seq(Pipeline([Command(['a'], [])], False), |
| 325 | '&', |
| 326 | Pipeline([Command(['b'], [])], False))) |
| 327 | |
| 328 | self.assertEqual(self.parse('a && b'), |
| 329 | Seq(Pipeline([Command(['a'], [])], False), |
| 330 | '&&', |
| 331 | Pipeline([Command(['b'], [])], False))) |
| 332 | |
| 333 | self.assertEqual(self.parse('a || b'), |
| 334 | Seq(Pipeline([Command(['a'], [])], False), |
| 335 | '||', |
| 336 | Pipeline([Command(['b'], [])], False))) |
| 337 | |
| 338 | self.assertEqual(self.parse('a && b || c'), |
| 339 | Seq(Seq(Pipeline([Command(['a'], [])], False), |
| 340 | '&&', |
| 341 | Pipeline([Command(['b'], [])], False)), |
| 342 | '||', |
| 343 | Pipeline([Command(['c'], [])], False))) |
| 344 | |
| 345 | if __name__ == '__main__': |
| 346 | unittest.main() |