blob: 3edd3db1ed9ed445cba637570e6a04a2652765b3 [file] [log] [blame]
"""A lexical analyzer class for simple shell-like syntaxes."""

# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
Guido van Rossum9c30c241998-12-22 05:19:29 +00008
import os.path
import sys
from collections import deque
from io import StringIO
Gustavo Niemeyer68d8cef2003-04-17 21:31:33 +000014
# Public names exported by "from <module> import *".
__all__ = ["shlex", "split"]
Skip Montanaro0de65802001-02-15 22:15:14 +000016
class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    The lexer reads characters one at a time from ``instream`` and groups
    them into tokens.  Instances are iterable: iteration yields tokens
    until the end-of-file marker (``self.eof``) is reached.

    Parameters:
        instream: a string (wrapped in a StringIO), an object providing
            read() and readline(), or None to read from sys.stdin.
        infile: name associated with ``instream``; only stored when
            ``instream`` is given.  Used by error_leader() and to resolve
            relative paths in sourcehook().
        posix: when true, quotes are stripped from tokens, escape
            characters are honoured, and EOF is reported as None instead
            of the empty string.
    """

    def __init__(self, instream=None, infile=None, posix=False):
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # Value returned by get_token()/read_token() at end of input.
        if posix:
            self.eof = None
        else:
            self.eof = ''
        # Characters that start a comment running to the end of the line.
        self.commenters = '#'
        # Characters that accumulate into multi-character tokens.
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        # When true, tokens are split on whitespace only.
        self.whitespace_split = False
        self.quotes = '\'"'
        # Escape handling is only active in posix mode.
        self.escape = '\\'
        # Quote characters inside which the escape character is honoured.
        self.escapedquotes = '"'
        # Lexer state: ' ' (between tokens), 'a' (inside a word), a quote
        # character, an escape character, or None (past end of file).
        self.state = ' '
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        # Stack of (infile, instream, lineno) for suspended input sources.
        self.filestack = deque()
        # Token that triggers file inclusion, or None to disable it.
        self.source = None

    def push_token(self, tok):
        """Push a token onto the stack popped by the get_token method."""
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        """Push an input source onto the lexer's input source stack.

        ``newstream`` may be a string (wrapped in a StringIO) or a
        stream object.  The current source and its line number are
        saved and restored later by pop_source().
        """
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        """Pop the input source stack.

        Closes the current stream and resumes the previously pushed one.
        """
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        """Get a token from the input stream (or from stack if it's nonempty).

        Returns ``self.eof`` once every input source is exhausted.
        """
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                # An included source ran out: resume the enclosing one.
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def read_token(self):
        """Read one raw token from the current input stream.

        Ignores the token pushback stack on entry and does not interpret
        source requests.  Returns '' at end of file, or None in posix
        mode when no quoted text was seen.

        Raises:
            ValueError: on end of file inside a quoted string or right
                after an escape character.
        """
        quoted = False
        escapedstate = ' '
        while True:
            nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno += 1
            if self.debug >= 3:
                print("shlex: in state", repr(self.state),
                      "I see character:", repr(nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                elif nextchar in self.commenters:
                    # Discard the rest of the comment line.
                    self.instream.readline()
                    self.lineno += 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    # Non-posix tokens keep their surrounding quotes.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Any other character is a token on its own.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token += nextchar
                        self.state = ' '
                        break
                    else:
                        # In posix mode a closing quote continues the word.
                        self.state = 'a'
                elif (self.posix and nextchar in self.escape
                      and self.state in self.escapedquotes):
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if (escapedstate in self.quotes
                        and nextchar != self.state
                        and nextchar != escapedstate):
                    self.token += self.state
                self.token += nextchar
                self.state = escapedstate
            elif self.state == 'a':
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno += 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
                      or self.whitespace_split):
                    self.token += nextchar
                else:
                    # Punctuation ends the word; hand it back as the
                    # next token via the pushback stack.
                    self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token:
                        break   # emit current token
        result = self.token
        self.token = ''
        if self.posix and not quoted and result == '':
            result = None
        if self.debug >= 2:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        """Hook called on a filename to be sourced.

        Returns a ``(filename, open_stream)`` tuple for push_source().

        Raises:
            ValueError: if no file name follows the source keyword
                (``newfile`` is empty or None).
        """
        if not newfile:
            # The source keyword was the last token of the input.
            raise ValueError("No file name to source")
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        """Emit a C-compiler-like, Emacs-friendly error-message leader.

        ``infile`` and ``lineno`` default to the lexer's current values.
        """
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        """Return the lexer itself; it is its own iterator."""
        return self

    def __next__(self):
        """Return the next token, raising StopIteration at end of input."""
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
269
def split(s, comments=False, posix=True):
    """Split *s* into a list of tokens using shell-like syntax.

    Tokens are separated on whitespace only.  Unless *comments* is true,
    comment handling is disabled, so '#' is treated like any other
    character.  *posix* is passed through to the shlex lexer.  *s* is
    handed to shlex unchanged, so passing None reads from sys.stdin.
    """
    lex = shlex(s, posix=posix)
    lex.whitespace_split = True
    if not comments:
        lex.commenters = ''
    return list(lex)
Guido van Rossum9c30c241998-12-22 05:19:29 +0000276
# Script mode: print every token read from stdin, or from the file named
# by the first command-line argument.
if __name__ == '__main__':
    if len(sys.argv) == 1:
        stream = None
        lexer = shlex()
    else:
        file = sys.argv[1]
        stream = open(file)
        lexer = shlex(stream, file)
    try:
        while True:
            tt = lexer.get_token()
            if not tt:
                # Non-posix lexer: an empty token marks end of input.
                break
            print("Token: " + repr(tt))
    finally:
        # Close the file we opened ourselves; stdin is left alone.
        if stream is not None:
            stream.close()