Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 1 | import itertools |
| 2 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 3 | import Util |
| 4 | |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 5 | # FIXME: It would be nice to at least match a few other things like `...`, $( |
| 6 | # ... ), $VAR, etc., if only so we can nicely say "we don't support this". |
| 7 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 8 | class ShLexer: |
| 9 | def __init__(self, data): |
| 10 | self.data = data |
| 11 | self.pos = 0 |
| 12 | self.end = len(data) |
| 13 | |
| 14 | def eat(self): |
| 15 | c = self.data[self.pos] |
| 16 | self.pos += 1 |
| 17 | return c |
| 18 | |
| 19 | def look(self): |
| 20 | return self.data[self.pos] |
| 21 | |
| 22 | def maybe_eat(self, c): |
| 23 | """ |
| 24 | maybe_eat(c) - Consume the character c if it is the next character, |
| 25 | returning True if a character was consumed. """ |
| 26 | if self.data[self.pos] == c: |
| 27 | self.pos += 1 |
| 28 | return True |
| 29 | return False |
| 30 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 31 | def lex_arg_fast(self, c): |
| 32 | # Get the leading whitespace free section. |
| 33 | chunk = self.data[self.pos - 1:].split(None, 1)[0] |
| 34 | |
| 35 | # If it has special characters, the fast path failed. |
| 36 | if ('|' in chunk or '&' in chunk or |
| 37 | '<' in chunk or '>' in chunk or |
Daniel Dunbar | a39be6a | 2009-08-01 09:41:09 +0000 | [diff] [blame] | 38 | "'" in chunk or '"' in chunk or |
| 39 | '\\' in chunk): |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 40 | return None |
| 41 | |
| 42 | self.pos = self.pos - 1 + len(chunk) |
| 43 | return chunk |
| 44 | |
| 45 | def lex_arg_slow(self, c): |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 46 | if c in "'\"": |
| 47 | str = self.lex_arg_quoted(c) |
| 48 | else: |
| 49 | str = c |
| 50 | while self.pos != self.end: |
| 51 | c = self.look() |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 52 | if c.isspace() or c in "|&": |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 53 | break |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 54 | elif c in '><': |
| 55 | # This is an annoying case; we treat '2>' as a single token so |
| 56 | # we don't have to track whitespace tokens. |
| 57 | |
| 58 | # If the parse string isn't an integer, do the usual thing. |
| 59 | if not str.isdigit(): |
| 60 | break |
| 61 | |
| 62 | # Otherwise, lex the operator and convert to a redirection |
| 63 | # token. |
| 64 | num = int(str) |
| 65 | tok = self.lex_one_token() |
| 66 | assert isinstance(tok, tuple) and len(tok) == 1 |
| 67 | return (tok[0], num) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 68 | elif c == '"': |
| 69 | self.eat() |
| 70 | str += self.lex_arg_quoted('"') |
Daniel Dunbar | a39be6a | 2009-08-01 09:41:09 +0000 | [diff] [blame] | 71 | elif c == '\\': |
| 72 | # Outside of a string, '\\' escapes everything. |
| 73 | self.eat() |
| 74 | if self.pos == self.end: |
| 75 | Util.warning("escape at end of quoted argument in: %r" % |
| 76 | self.data) |
| 77 | return str |
| 78 | str += self.eat() |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 79 | else: |
| 80 | str += self.eat() |
| 81 | return str |
| 82 | |
| 83 | def lex_arg_quoted(self, delim): |
| 84 | str = '' |
| 85 | while self.pos != self.end: |
| 86 | c = self.eat() |
| 87 | if c == delim: |
| 88 | return str |
| 89 | elif c == '\\' and delim == '"': |
Daniel Dunbar | a39be6a | 2009-08-01 09:41:09 +0000 | [diff] [blame] | 90 | # Inside a '"' quoted string, '\\' only escapes the quote |
| 91 | # character and backslash, otherwise it is preserved. |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 92 | if self.pos == self.end: |
| 93 | Util.warning("escape at end of quoted argument in: %r" % |
| 94 | self.data) |
| 95 | return str |
| 96 | c = self.eat() |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 97 | if c == '"': # |
| 98 | str += '"' |
| 99 | elif c == '\\': |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 100 | str += '\\' |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 101 | else: |
| 102 | str += '\\' + c |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 103 | else: |
| 104 | str += c |
| 105 | Util.warning("missing quote character in %r" % self.data) |
| 106 | return str |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 107 | |
| 108 | def lex_arg_checked(self, c): |
| 109 | pos = self.pos |
| 110 | res = self.lex_arg_fast(c) |
| 111 | end = self.pos |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 112 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 113 | self.pos = pos |
| 114 | reference = self.lex_arg_slow(c) |
| 115 | if res is not None: |
| 116 | if res != reference: |
| 117 | raise ValueError,"Fast path failure: %r != %r" % (res, reference) |
| 118 | if self.pos != end: |
| 119 | raise ValueError,"Fast path failure: %r != %r" % (self.pos, end) |
| 120 | return reference |
| 121 | |
| 122 | def lex_arg(self, c): |
| 123 | return self.lex_arg_fast(c) or self.lex_arg_slow(c) |
| 124 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 125 | def lex_one_token(self): |
| 126 | """ |
| 127 | lex_one_token - Lex a single 'sh' token. """ |
| 128 | |
| 129 | c = self.eat() |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 130 | if c in ';!': |
| 131 | return (c,) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 132 | if c == '|': |
| 133 | if self.maybe_eat('|'): |
| 134 | return ('||',) |
| 135 | return (c,) |
| 136 | if c == '&': |
| 137 | if self.maybe_eat('&'): |
| 138 | return ('&&',) |
| 139 | if self.maybe_eat('>'): |
| 140 | return ('&>',) |
| 141 | return (c,) |
| 142 | if c == '>': |
| 143 | if self.maybe_eat('&'): |
| 144 | return ('>&',) |
| 145 | if self.maybe_eat('>'): |
| 146 | return ('>>',) |
| 147 | return (c,) |
| 148 | if c == '<': |
| 149 | if self.maybe_eat('&'): |
| 150 | return ('<&',) |
| 151 | if self.maybe_eat('>'): |
| 152 | return ('<<',) |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 153 | return (c,) |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 154 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 155 | return self.lex_arg(c) |
| 156 | |
| 157 | def lex(self): |
| 158 | while self.pos != self.end: |
| 159 | if self.look().isspace(): |
| 160 | self.eat() |
| 161 | else: |
| 162 | yield self.lex_one_token() |
| 163 | |
| 164 | ### |
| 165 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 166 | class Command: |
| 167 | def __init__(self, args, redirects): |
| 168 | self.args = list(args) |
| 169 | self.redirects = list(redirects) |
| 170 | |
| 171 | def __repr__(self): |
| 172 | return 'Command(%r, %r)' % (self.args, self.redirects) |
| 173 | |
| 174 | def __cmp__(self, other): |
| 175 | if not isinstance(other, Command): |
| 176 | return -1 |
| 177 | |
| 178 | return cmp((self.args, self.redirects), |
| 179 | (other.args, other.redirects)) |
| 180 | |
| 181 | class Pipeline: |
| 182 | def __init__(self, commands, negate): |
| 183 | self.commands = commands |
| 184 | self.negate = negate |
| 185 | |
| 186 | def __repr__(self): |
| 187 | return 'Pipeline(%r, %r)' % (self.commands, self.negate) |
| 188 | |
| 189 | def __cmp__(self, other): |
| 190 | if not isinstance(other, Pipeline): |
| 191 | return -1 |
| 192 | |
| 193 | return cmp((self.commands, self.negate), |
| 194 | (other.commands, other.negate)) |
| 195 | |
| 196 | class Seq: |
| 197 | def __init__(self, lhs, op, rhs): |
| 198 | assert op in (';', '&', '||', '&&') |
| 199 | self.op = op |
| 200 | self.lhs = lhs |
| 201 | self.rhs = rhs |
| 202 | |
| 203 | def __repr__(self): |
| 204 | return 'Seq(%r, %r, %r)' % (self.lhs, self.op, self.rhs) |
| 205 | |
| 206 | def __cmp__(self, other): |
| 207 | if not isinstance(other, Seq): |
| 208 | return -1 |
| 209 | |
| 210 | return cmp((self.lhs, self.op, self.rhs), |
| 211 | (other.lhs, other.op, other.rhs)) |
| 212 | |
| 213 | class ShParser: |
| 214 | def __init__(self, data): |
| 215 | self.data = data |
| 216 | self.tokens = ShLexer(data).lex() |
| 217 | |
| 218 | def lex(self): |
| 219 | try: |
| 220 | return self.tokens.next() |
| 221 | except StopIteration: |
| 222 | return None |
| 223 | |
| 224 | def look(self): |
| 225 | next = self.lex() |
Daniel Dunbar | a39be6a | 2009-08-01 09:41:09 +0000 | [diff] [blame] | 226 | if next is not None: |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 227 | self.tokens = itertools.chain([next], self.tokens) |
| 228 | return next |
| 229 | |
| 230 | def parse_command(self): |
| 231 | tok = self.lex() |
| 232 | if not tok: |
| 233 | raise ValueError,"empty command!" |
| 234 | if isinstance(tok, tuple): |
| 235 | raise ValueError,"syntax error near unexpected token %r" % tok[0] |
| 236 | |
| 237 | args = [tok] |
| 238 | redirects = [] |
| 239 | while 1: |
| 240 | tok = self.look() |
| 241 | |
| 242 | # EOF? |
| 243 | if tok is None: |
| 244 | break |
| 245 | |
| 246 | # If this is an argument, just add it to the current command. |
| 247 | if isinstance(tok, str): |
| 248 | args.append(self.lex()) |
| 249 | continue |
| 250 | |
| 251 | # Otherwise see if it is a terminator. |
| 252 | assert isinstance(tok, tuple) |
| 253 | if tok[0] in ('|',';','&','||','&&'): |
| 254 | break |
| 255 | |
| 256 | # Otherwise it must be a redirection. |
| 257 | op = self.lex() |
| 258 | arg = self.lex() |
| 259 | if not arg: |
| 260 | raise ValueError,"syntax error near token %r" % op[0] |
| 261 | redirects.append((op, arg)) |
| 262 | |
| 263 | return Command(args, redirects) |
| 264 | |
| 265 | def parse_pipeline(self): |
| 266 | negate = False |
| 267 | if self.look() == ('!',): |
| 268 | self.lex() |
| 269 | negate = True |
| 270 | |
| 271 | commands = [self.parse_command()] |
| 272 | while self.look() == ('|',): |
| 273 | self.lex() |
| 274 | commands.append(self.parse_command()) |
| 275 | return Pipeline(commands, negate) |
| 276 | |
| 277 | def parse(self): |
| 278 | lhs = self.parse_pipeline() |
| 279 | |
| 280 | while self.look(): |
| 281 | operator = self.lex() |
| 282 | assert isinstance(operator, tuple) and len(operator) == 1 |
| 283 | |
| 284 | if not self.look(): |
| 285 | raise ValueError, "missing argument to operator %r" % operator[0] |
| 286 | |
| 287 | # FIXME: Operator precedence!! |
| 288 | lhs = Seq(lhs, operator[0], self.parse_pipeline()) |
| 289 | |
| 290 | return lhs |
| 291 | |
| 292 | ### |
| 293 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 294 | import unittest |
| 295 | |
| 296 | class TestShLexer(unittest.TestCase): |
| 297 | def lex(self, str): |
| 298 | return list(ShLexer(str).lex()) |
| 299 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 300 | def test_basic(self): |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 301 | self.assertEqual(self.lex('a|b>c&d<e'), |
| 302 | ['a', ('|',), 'b', ('>',), 'c', ('&',), 'd', |
| 303 | ('<',), 'e']) |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 304 | |
| 305 | def test_redirection_tokens(self): |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 306 | self.assertEqual(self.lex('a2>c'), |
| 307 | ['a2', ('>',), 'c']) |
| 308 | self.assertEqual(self.lex('a 2>c'), |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 309 | ['a', ('>',2), 'c']) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 310 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 311 | def test_quoting(self): |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 312 | self.assertEqual(self.lex(""" 'a' """), |
| 313 | ['a']) |
| 314 | self.assertEqual(self.lex(""" "hello\\"world" """), |
| 315 | ['hello"world']) |
| 316 | self.assertEqual(self.lex(""" "hello\\'world" """), |
| 317 | ["hello\\'world"]) |
Daniel Dunbar | ee41c4d | 2009-08-01 05:52:04 +0000 | [diff] [blame] | 318 | self.assertEqual(self.lex(""" "hello\\\\world" """), |
| 319 | ["hello\\world"]) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 320 | self.assertEqual(self.lex(""" he"llo wo"rld """), |
| 321 | ["hello world"]) |
Daniel Dunbar | a39be6a | 2009-08-01 09:41:09 +0000 | [diff] [blame] | 322 | self.assertEqual(self.lex(""" a\\ b a\\\\b """), |
| 323 | ["a b", "a\\b"]) |
| 324 | self.assertEqual(self.lex(""" "" "" """), |
| 325 | ["", ""]) |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 326 | |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 327 | class TestShParse(unittest.TestCase): |
| 328 | def parse(self, str): |
| 329 | return ShParser(str).parse() |
| 330 | |
| 331 | def test_basic(self): |
| 332 | self.assertEqual(self.parse('echo hello'), |
| 333 | Pipeline([Command(['echo', 'hello'], [])], False)) |
Daniel Dunbar | a39be6a | 2009-08-01 09:41:09 +0000 | [diff] [blame] | 334 | self.assertEqual(self.parse('echo ""'), |
| 335 | Pipeline([Command(['echo', ''], [])], False)) |
Daniel Dunbar | 93fe03f | 2009-08-01 03:22:27 +0000 | [diff] [blame] | 336 | |
| 337 | def test_redirection(self): |
| 338 | self.assertEqual(self.parse('echo hello > c'), |
| 339 | Pipeline([Command(['echo', 'hello'], |
| 340 | [((('>'),), 'c')])], False)) |
| 341 | self.assertEqual(self.parse('echo hello > c >> d'), |
| 342 | Pipeline([Command(['echo', 'hello'], [(('>',), 'c'), |
| 343 | (('>>',), 'd')])], False)) |
| 344 | |
| 345 | def test_pipeline(self): |
| 346 | self.assertEqual(self.parse('a | b'), |
| 347 | Pipeline([Command(['a'], []), |
| 348 | Command(['b'], [])], |
| 349 | False)) |
| 350 | |
| 351 | self.assertEqual(self.parse('a | b | c'), |
| 352 | Pipeline([Command(['a'], []), |
| 353 | Command(['b'], []), |
| 354 | Command(['c'], [])], |
| 355 | False)) |
| 356 | |
| 357 | self.assertEqual(self.parse('! a'), |
| 358 | Pipeline([Command(['a'], [])], |
| 359 | True)) |
| 360 | |
| 361 | def test_list(self): |
| 362 | self.assertEqual(self.parse('a ; b'), |
| 363 | Seq(Pipeline([Command(['a'], [])], False), |
| 364 | ';', |
| 365 | Pipeline([Command(['b'], [])], False))) |
| 366 | |
| 367 | self.assertEqual(self.parse('a & b'), |
| 368 | Seq(Pipeline([Command(['a'], [])], False), |
| 369 | '&', |
| 370 | Pipeline([Command(['b'], [])], False))) |
| 371 | |
| 372 | self.assertEqual(self.parse('a && b'), |
| 373 | Seq(Pipeline([Command(['a'], [])], False), |
| 374 | '&&', |
| 375 | Pipeline([Command(['b'], [])], False))) |
| 376 | |
| 377 | self.assertEqual(self.parse('a || b'), |
| 378 | Seq(Pipeline([Command(['a'], [])], False), |
| 379 | '||', |
| 380 | Pipeline([Command(['b'], [])], False))) |
| 381 | |
| 382 | self.assertEqual(self.parse('a && b || c'), |
| 383 | Seq(Seq(Pipeline([Command(['a'], [])], False), |
| 384 | '&&', |
| 385 | Pipeline([Command(['b'], [])], False)), |
| 386 | '||', |
| 387 | Pipeline([Command(['c'], [])], False))) |
Daniel Dunbar | a39be6a | 2009-08-01 09:41:09 +0000 | [diff] [blame] | 388 | |
Daniel Dunbar | 7b90be7 | 2009-07-31 07:59:05 +0000 | [diff] [blame] | 389 | if __name__ == '__main__': |
| 390 | unittest.main() |