blob: 2c9786c517a350f2bd766b539e41ac860f9dd435 [file] [log] [blame]
"""A lexical analyzer class for simple shell-like syntaxes."""

# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
# changes to tokenize more like Posix shells by Vinay Sajip, July 2016.
Guido van Rossum9c30c241998-12-22 05:19:29 +00009
Éric Araujo9bce3112011-07-27 18:29:31 +020010import os
11import re
Guido van Rossum73898c71999-05-03 18:14:16 +000012import sys
Raymond Hettinger756b3f32004-01-29 06:37:52 +000013from collections import deque
Guido van Rossum9c30c241998-12-22 05:19:29 +000014
Guido van Rossum68937b42007-05-18 00:51:22 +000015from io import StringIO
Gustavo Niemeyer68d8cef2003-04-17 21:31:33 +000016
Éric Araujo9bce3112011-07-27 18:29:31 +020017__all__ = ["shlex", "split", "quote"]
Skip Montanaro0de65802001-02-15 22:15:14 +000018
class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    Parameters:
        instream: a string (wrapped in a StringIO) or an object with
            read() and readline() methods; when None, sys.stdin is used.
        infile: name associated with *instream*; kept only when a stream
            was supplied.  Used by error_leader() and sourcehook().
        posix: when true, quotes and escapes are interpreted and removed
            and EOF is reported as None; otherwise EOF is reported as ''.
        punctuation_chars: False/empty for none, True for '();<>|&', or a
            string of characters that are returned as runs of punctuation.
    """

    def __init__(self, instream=None, infile=None, posix=False,
                 punctuation_chars=False):
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # The value get_token() returns at end of input.
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        # Lexer state: ' ' between tokens, 'a' in a word, 'c' in a run of
        # punctuation, a quote/escape character while inside one, and
        # None once end of file has been seen.
        self.state = ' '
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        self.filestack = deque()
        self.source = None
        if not punctuation_chars:
            punctuation_chars = ''
        elif punctuation_chars is True:
            punctuation_chars = '();<>|&'
        self.punctuation_chars = punctuation_chars
        # _pushback_chars is a push back queue used by lookahead logic.
        # It is always created so that read_token() keeps working when
        # punctuation_chars is assigned after construction.
        self._pushback_chars = deque()
        if punctuation_chars:
            # these chars added because allowed in file names, args, wildcards
            self.wordchars += '~-./*?='
            # remove any punctuation chars from wordchars
            t = self.wordchars.maketrans(dict.fromkeys(punctuation_chars))
            self.wordchars = self.wordchars.translate(t)

    def push_token(self, tok):
        """Push a token onto the stack popped by the get_token method."""
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        """Push an input source onto the lexer's input source stack.

        A string *newstream* is wrapped in a StringIO.  The current
        (infile, instream, lineno) triple is saved on the file stack and
        line counting restarts at 1 for the new source.
        """
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        """Pop the input source stack.

        Closes the current stream, restores the most recently saved
        (infile, instream, lineno) triple and resets the lexer state.
        """
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        """Get a token from the input stream (or from stack if it's nonempty).

        Returns self.eof once the outermost input source is exhausted.
        """
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                # An included source ran out: resume the enclosing one.
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def read_token(self):
        """Read one raw token from the current input stream.

        Ignores the token pushback stack and does not handle source
        inclusion.  Returns '' at end of input, or None in posix mode
        when no quoted (possibly empty) token was seen.

        Raises:
            ValueError: on EOF inside a quoted string or right after an
                escape character.
        """
        quoted = False
        escapedstate = ' '
        while True:
            # Characters set aside by the punctuation lookahead are
            # consumed before reading further from the stream.
            if self.punctuation_chars and self._pushback_chars:
                nextchar = self._pushback_chars.pop()
            else:
                nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno += 1
            if self.debug >= 3:
                print("shlex: in state %r I see character: %r" % (self.state,
                                                                  nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    # Discard the rest of the comment line.
                    self.instream.readline()
                    self.lineno += 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.punctuation_chars:
                    self.token = nextchar
                    self.state = 'c'
                elif nextchar in self.quotes:
                    # Non-posix mode keeps the quote as part of the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token += nextchar
                        self.state = ' '
                        break
                    else:
                        self.state = 'a'
                elif (self.posix and nextchar in self.escape and self.state
                      in self.escapedquotes):
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if (escapedstate in self.quotes and
                        nextchar != self.state and nextchar != escapedstate):
                    self.token += self.state
                self.token += nextchar
                self.state = escapedstate
            elif self.state in ('a', 'c'):
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno += 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.state == 'c':
                    if nextchar in self.punctuation_chars:
                        self.token += nextchar
                    else:
                        # End of the punctuation run: keep the character
                        # for the next token unless it is whitespace.
                        if nextchar not in self.whitespace:
                            self._pushback_chars.append(nextchar)
                        self.state = ' '
                        break
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
                      or (self.whitespace_split and
                          nextchar not in self.punctuation_chars)):
                    # whitespace_split must not swallow punctuation
                    # characters, otherwise 'a&&b' would stay one token.
                    self.token += nextchar
                else:
                    if self.punctuation_chars:
                        self._pushback_chars.append(nextchar)
                    else:
                        self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        # In posix mode an unquoted empty result means EOF; a quoted empty
        # string ('' or "") is a real token.
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        """Hook called on a filename to be sourced.

        Strips one pair of surrounding characters when *newfile* starts
        with a double quote, resolves a relative name against the
        directory of the current infile (when that is a string), and
        returns a (filename, open file object) tuple.
        """
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        """Emit a C-compiler-like, Emacs-friendly error-message leader.

        *infile* and *lineno* default to the lexer's current values.
        """
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next token, raising StopIteration at end of input."""
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
299
def split(s, comments=False, posix=True):
    """Tokenize *s* with a whitespace-splitting shlex and return a list.

    Comment handling is disabled unless *comments* is true; *posix* is
    forwarded to the lexer.
    """
    lexer = shlex(s, posix=posix)
    lexer.whitespace_split = True
    if not comments:
        # With no comment characters, '#' is lexed like any other text.
        lexer.commenters = ''
    tokens = list(lexer)
    return tokens
Guido van Rossum9c30c241998-12-22 05:19:29 +0000306
Éric Araujo9bce3112011-07-27 18:29:31 +0200307
# Search function that finds the first character that is not safe to
# leave unquoted.
_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search


def quote(s):
    """Return a shell-escaped version of the string *s*."""
    if not s:
        return "''"

    needs_quoting = _find_unsafe(s) is not None
    if not needs_quoting:
        return s

    # Wrap in single quotes; each embedded single quote is closed,
    # emitted inside double quotes, and reopened, so that the string
    # $'b is quoted as '$'"'"'b'
    escaped = s.replace("'", "'\"'\"'")
    return "".join(("'", escaped, "'"))
320
321
def _print_tokens(lexer):
    """Print every token from *lexer* until it yields a false value."""
    token = lexer.get_token()
    while token:
        print("Token: " + repr(token))
        token = lexer.get_token()
328
if __name__ == '__main__':
    # With a file argument, tokenize that file; otherwise tokenize the
    # lexer's default input.
    if len(sys.argv) != 1:
        path = sys.argv[1]
        with open(path) as stream:
            _print_tokens(shlex(stream, path))
    else:
        _print_tokens(shlex())
335 _print_tokens(shlex(f, fn))