Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 1 | import itertools |
| 2 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 3 | import Util |
| 4 | |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 5 | # FIXME: It would be nice to at least match a few other things like `...`, $( |
| 6 | # ... ), $VAR, etc., if only so we can nicely say "we don't support this". |
| 7 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 8 | class ShLexer: |
| 9 | def __init__(self, data): |
| 10 | self.data = data |
| 11 | self.pos = 0 |
| 12 | self.end = len(data) |
| 13 | |
| 14 | def eat(self): |
| 15 | c = self.data[self.pos] |
| 16 | self.pos += 1 |
| 17 | return c |
| 18 | |
| 19 | def look(self): |
| 20 | return self.data[self.pos] |
| 21 | |
| 22 | def maybe_eat(self, c): |
| 23 | """ |
| 24 | maybe_eat(c) - Consume the character c if it is the next character, |
| 25 | returning True if a character was consumed. """ |
| 26 | if self.data[self.pos] == c: |
| 27 | self.pos += 1 |
| 28 | return True |
| 29 | return False |
| 30 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 31 | def lex_arg_fast(self, c): |
| 32 | # Get the leading whitespace free section. |
| 33 | chunk = self.data[self.pos - 1:].split(None, 1)[0] |
| 34 | |
| 35 | # If it has special characters, the fast path failed. |
| 36 | if ('|' in chunk or '&' in chunk or |
| 37 | '<' in chunk or '>' in chunk or |
| 38 | "'" in chunk or '"' in chunk): |
| 39 | return None |
| 40 | |
| 41 | self.pos = self.pos - 1 + len(chunk) |
| 42 | return chunk |
| 43 | |
| 44 | def lex_arg_slow(self, c): |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 45 | if c in "'\"": |
| 46 | str = self.lex_arg_quoted(c) |
| 47 | else: |
| 48 | str = c |
| 49 | while self.pos != self.end: |
| 50 | c = self.look() |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 51 | if c.isspace() or c in "|&": |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 52 | break |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 53 | elif c in '><': |
| 54 | # This is an annoying case; we treat '2>' as a single token so |
| 55 | # we don't have to track whitespace tokens. |
| 56 | |
| 57 | # If the parse string isn't an integer, do the usual thing. |
| 58 | if not str.isdigit(): |
| 59 | break |
| 60 | |
| 61 | # Otherwise, lex the operator and convert to a redirection |
| 62 | # token. |
| 63 | num = int(str) |
| 64 | tok = self.lex_one_token() |
| 65 | assert isinstance(tok, tuple) and len(tok) == 1 |
| 66 | return (tok[0], num) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 67 | elif c == '"': |
| 68 | self.eat() |
| 69 | str += self.lex_arg_quoted('"') |
| 70 | else: |
| 71 | str += self.eat() |
| 72 | return str |
| 73 | |
| 74 | def lex_arg_quoted(self, delim): |
| 75 | str = '' |
| 76 | while self.pos != self.end: |
| 77 | c = self.eat() |
| 78 | if c == delim: |
| 79 | return str |
| 80 | elif c == '\\' and delim == '"': |
| 81 | # Shell escaping is just '\"' to avoid termination, no actual |
| 82 | # escaping. |
| 83 | if self.pos == self.end: |
| 84 | Util.warning("escape at end of quoted argument in: %r" % |
| 85 | self.data) |
| 86 | return str |
| 87 | c = self.eat() |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 88 | if c == '"': # |
| 89 | str += '"' |
| 90 | elif c == '\\': |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 91 | str += '\\' |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 92 | else: |
| 93 | str += '\\' + c |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 94 | else: |
| 95 | str += c |
| 96 | Util.warning("missing quote character in %r" % self.data) |
| 97 | return str |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 98 | |
| 99 | def lex_arg_checked(self, c): |
| 100 | pos = self.pos |
| 101 | res = self.lex_arg_fast(c) |
| 102 | end = self.pos |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 103 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 104 | self.pos = pos |
| 105 | reference = self.lex_arg_slow(c) |
| 106 | if res is not None: |
| 107 | if res != reference: |
| 108 | raise ValueError,"Fast path failure: %r != %r" % (res, reference) |
| 109 | if self.pos != end: |
| 110 | raise ValueError,"Fast path failure: %r != %r" % (self.pos, end) |
| 111 | return reference |
| 112 | |
| 113 | def lex_arg(self, c): |
| 114 | return self.lex_arg_fast(c) or self.lex_arg_slow(c) |
| 115 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 116 | def lex_one_token(self): |
| 117 | """ |
| 118 | lex_one_token - Lex a single 'sh' token. """ |
| 119 | |
| 120 | c = self.eat() |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 121 | if c in ';!': |
| 122 | return (c,) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 123 | if c == '|': |
| 124 | if self.maybe_eat('|'): |
| 125 | return ('||',) |
| 126 | return (c,) |
| 127 | if c == '&': |
| 128 | if self.maybe_eat('&'): |
| 129 | return ('&&',) |
| 130 | if self.maybe_eat('>'): |
| 131 | return ('&>',) |
| 132 | return (c,) |
| 133 | if c == '>': |
| 134 | if self.maybe_eat('&'): |
| 135 | return ('>&',) |
| 136 | if self.maybe_eat('>'): |
| 137 | return ('>>',) |
| 138 | return (c,) |
| 139 | if c == '<': |
| 140 | if self.maybe_eat('&'): |
| 141 | return ('<&',) |
| 142 | if self.maybe_eat('>'): |
| 143 | return ('<<',) |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 144 | return (c,) |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 145 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 146 | return self.lex_arg(c) |
| 147 | |
| 148 | def lex(self): |
| 149 | while self.pos != self.end: |
| 150 | if self.look().isspace(): |
| 151 | self.eat() |
| 152 | else: |
| 153 | yield self.lex_one_token() |
| 154 | |
| 155 | ### |
| 156 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 157 | class Command: |
| 158 | def __init__(self, args, redirects): |
| 159 | self.args = list(args) |
| 160 | self.redirects = list(redirects) |
| 161 | |
| 162 | def __repr__(self): |
| 163 | return 'Command(%r, %r)' % (self.args, self.redirects) |
| 164 | |
| 165 | def __cmp__(self, other): |
| 166 | if not isinstance(other, Command): |
| 167 | return -1 |
| 168 | |
| 169 | return cmp((self.args, self.redirects), |
| 170 | (other.args, other.redirects)) |
| 171 | |
| 172 | class Pipeline: |
| 173 | def __init__(self, commands, negate): |
| 174 | self.commands = commands |
| 175 | self.negate = negate |
| 176 | |
| 177 | def __repr__(self): |
| 178 | return 'Pipeline(%r, %r)' % (self.commands, self.negate) |
| 179 | |
| 180 | def __cmp__(self, other): |
| 181 | if not isinstance(other, Pipeline): |
| 182 | return -1 |
| 183 | |
| 184 | return cmp((self.commands, self.negate), |
| 185 | (other.commands, other.negate)) |
| 186 | |
| 187 | class Seq: |
| 188 | def __init__(self, lhs, op, rhs): |
| 189 | assert op in (';', '&', '||', '&&') |
| 190 | self.op = op |
| 191 | self.lhs = lhs |
| 192 | self.rhs = rhs |
| 193 | |
| 194 | def __repr__(self): |
| 195 | return 'Seq(%r, %r, %r)' % (self.lhs, self.op, self.rhs) |
| 196 | |
| 197 | def __cmp__(self, other): |
| 198 | if not isinstance(other, Seq): |
| 199 | return -1 |
| 200 | |
| 201 | return cmp((self.lhs, self.op, self.rhs), |
| 202 | (other.lhs, other.op, other.rhs)) |
| 203 | |
| 204 | class ShParser: |
| 205 | def __init__(self, data): |
| 206 | self.data = data |
| 207 | self.tokens = ShLexer(data).lex() |
| 208 | |
| 209 | def lex(self): |
| 210 | try: |
| 211 | return self.tokens.next() |
| 212 | except StopIteration: |
| 213 | return None |
| 214 | |
| 215 | def look(self): |
| 216 | next = self.lex() |
| 217 | if next: |
| 218 | self.tokens = itertools.chain([next], self.tokens) |
| 219 | return next |
| 220 | |
| 221 | def parse_command(self): |
| 222 | tok = self.lex() |
| 223 | if not tok: |
| 224 | raise ValueError,"empty command!" |
| 225 | if isinstance(tok, tuple): |
| 226 | raise ValueError,"syntax error near unexpected token %r" % tok[0] |
| 227 | |
| 228 | args = [tok] |
| 229 | redirects = [] |
| 230 | while 1: |
| 231 | tok = self.look() |
| 232 | |
| 233 | # EOF? |
| 234 | if tok is None: |
| 235 | break |
| 236 | |
| 237 | # If this is an argument, just add it to the current command. |
| 238 | if isinstance(tok, str): |
| 239 | args.append(self.lex()) |
| 240 | continue |
| 241 | |
| 242 | # Otherwise see if it is a terminator. |
| 243 | assert isinstance(tok, tuple) |
| 244 | if tok[0] in ('|',';','&','||','&&'): |
| 245 | break |
| 246 | |
| 247 | # Otherwise it must be a redirection. |
| 248 | op = self.lex() |
| 249 | arg = self.lex() |
| 250 | if not arg: |
| 251 | raise ValueError,"syntax error near token %r" % op[0] |
| 252 | redirects.append((op, arg)) |
| 253 | |
| 254 | return Command(args, redirects) |
| 255 | |
| 256 | def parse_pipeline(self): |
| 257 | negate = False |
| 258 | if self.look() == ('!',): |
| 259 | self.lex() |
| 260 | negate = True |
| 261 | |
| 262 | commands = [self.parse_command()] |
| 263 | while self.look() == ('|',): |
| 264 | self.lex() |
| 265 | commands.append(self.parse_command()) |
| 266 | return Pipeline(commands, negate) |
| 267 | |
| 268 | def parse(self): |
| 269 | lhs = self.parse_pipeline() |
| 270 | |
| 271 | while self.look(): |
| 272 | operator = self.lex() |
| 273 | assert isinstance(operator, tuple) and len(operator) == 1 |
| 274 | |
| 275 | if not self.look(): |
| 276 | raise ValueError, "missing argument to operator %r" % operator[0] |
| 277 | |
| 278 | # FIXME: Operator precedence!! |
| 279 | lhs = Seq(lhs, operator[0], self.parse_pipeline()) |
| 280 | |
| 281 | return lhs |
| 282 | |
| 283 | ### |
| 284 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 285 | import unittest |
| 286 | |
| 287 | class TestShLexer(unittest.TestCase): |
| 288 | def lex(self, str): |
| 289 | return list(ShLexer(str).lex()) |
| 290 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 291 | def test_basic(self): |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 292 | self.assertEqual(self.lex('a|b>c&d<e'), |
| 293 | ['a', ('|',), 'b', ('>',), 'c', ('&',), 'd', |
| 294 | ('<',), 'e']) |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 295 | |
| 296 | def test_redirection_tokens(self): |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 297 | self.assertEqual(self.lex('a2>c'), |
| 298 | ['a2', ('>',), 'c']) |
| 299 | self.assertEqual(self.lex('a 2>c'), |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 300 | ['a', ('>',2), 'c']) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 301 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 302 | def test_quoting(self): |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 303 | self.assertEqual(self.lex(""" 'a' """), |
| 304 | ['a']) |
| 305 | self.assertEqual(self.lex(""" "hello\\"world" """), |
| 306 | ['hello"world']) |
| 307 | self.assertEqual(self.lex(""" "hello\\'world" """), |
| 308 | ["hello\\'world"]) |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 309 | self.assertEqual(self.lex(""" "hello\\\\world" """), |
| 310 | ["hello\\world"]) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 311 | self.assertEqual(self.lex(""" he"llo wo"rld """), |
| 312 | ["hello world"]) |
| 313 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 314 | class TestShParse(unittest.TestCase): |
| 315 | def parse(self, str): |
| 316 | return ShParser(str).parse() |
| 317 | |
| 318 | def test_basic(self): |
| 319 | self.assertEqual(self.parse('echo hello'), |
| 320 | Pipeline([Command(['echo', 'hello'], [])], False)) |
| 321 | |
| 322 | def test_redirection(self): |
| 323 | self.assertEqual(self.parse('echo hello > c'), |
| 324 | Pipeline([Command(['echo', 'hello'], |
| 325 | [((('>'),), 'c')])], False)) |
| 326 | self.assertEqual(self.parse('echo hello > c >> d'), |
| 327 | Pipeline([Command(['echo', 'hello'], [(('>',), 'c'), |
| 328 | (('>>',), 'd')])], False)) |
| 329 | |
| 330 | def test_pipeline(self): |
| 331 | self.assertEqual(self.parse('a | b'), |
| 332 | Pipeline([Command(['a'], []), |
| 333 | Command(['b'], [])], |
| 334 | False)) |
| 335 | |
| 336 | self.assertEqual(self.parse('a | b | c'), |
| 337 | Pipeline([Command(['a'], []), |
| 338 | Command(['b'], []), |
| 339 | Command(['c'], [])], |
| 340 | False)) |
| 341 | |
| 342 | self.assertEqual(self.parse('! a'), |
| 343 | Pipeline([Command(['a'], [])], |
| 344 | True)) |
| 345 | |
| 346 | def test_list(self): |
| 347 | self.assertEqual(self.parse('a ; b'), |
| 348 | Seq(Pipeline([Command(['a'], [])], False), |
| 349 | ';', |
| 350 | Pipeline([Command(['b'], [])], False))) |
| 351 | |
| 352 | self.assertEqual(self.parse('a & b'), |
| 353 | Seq(Pipeline([Command(['a'], [])], False), |
| 354 | '&', |
| 355 | Pipeline([Command(['b'], [])], False))) |
| 356 | |
| 357 | self.assertEqual(self.parse('a && b'), |
| 358 | Seq(Pipeline([Command(['a'], [])], False), |
| 359 | '&&', |
| 360 | Pipeline([Command(['b'], [])], False))) |
| 361 | |
| 362 | self.assertEqual(self.parse('a || b'), |
| 363 | Seq(Pipeline([Command(['a'], [])], False), |
| 364 | '||', |
| 365 | Pipeline([Command(['b'], [])], False))) |
| 366 | |
| 367 | self.assertEqual(self.parse('a && b || c'), |
| 368 | Seq(Seq(Pipeline([Command(['a'], [])], False), |
| 369 | '&&', |
| 370 | Pipeline([Command(['b'], [])], False)), |
| 371 | '||', |
| 372 | Pipeline([Command(['c'], [])], False))) |
| 373 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 374 | if __name__ == '__main__': |
| 375 | unittest.main() |