bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071)

commit: 56624a99a916fd27152d5b23364303acc0d707de [log] [tgz]
author: Evan <evanunderscore@gmail.com> Sun Jun 02 05:09:22 2019 +1000
committer: Vinay Sajip <vinay_sajip@yahoo.co.uk> Sat Jun 01 20:09:22 2019 +0100
tree: 469ecf27c685101302f1c9c365f394df174e68e9
parent: 2b843ac0ae745026ce39514573c5d075137bef65 [diff]
diff --git a/Lib/shlex.py b/Lib/shlex.py
index fb1130d..edea077 100644
--- a/Lib/shlex.py
+++ b/Lib/shlex.py

@@ -246,7 +246,8 @@
                     escapedstate = 'a'
                     self.state = nextchar
                 elif (nextchar in self.wordchars or nextchar in self.quotes
-                      or self.whitespace_split):
+                      or (self.whitespace_split and
+                          nextchar not in self.punctuation_chars)):
                     self.token += nextchar
                 else:
                     if self.punctuation_chars:

diff --git a/Lib/test/test_shlex.py b/Lib/test/test_shlex.py
index a432610..376c5e8 100644
--- a/Lib/test/test_shlex.py
+++ b/Lib/test/test_shlex.py

@@ -1,4 +1,5 @@
 import io
+import itertools
 import shlex
 import string
 import unittest
@@ -183,10 +184,12 @@
             src = ['echo hi %s echo bye' % delimiter,
                    'echo hi%secho bye' % delimiter]
             ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                 s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                 result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitSemicolon(self):
         """Test handling of syntax splitting of ;"""
@@ -197,10 +200,12 @@
                    'echo hi%s echo bye' % delimiter,
                    'echo hi%secho bye' % delimiter]
             ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                 s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                 result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitRedirect(self):
         """Test handling of syntax splitting of >"""
@@ -211,10 +216,12 @@
                    'echo hi%s out' % delimiter,
                    'echo hi%sout' % delimiter]
             ref = ['echo', 'hi', delimiter, 'out']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                 s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                 result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitParen(self):
         """Test handling of syntax splitting of ()"""
@@ -222,18 +228,25 @@
         src = ['( echo hi )',
                '(echo hi)']
         ref = ['(', 'echo', 'hi', ')']
-        for ss in src:
+        for ss, ws in itertools.product(src, (False, True)):
             s = shlex.shlex(ss, punctuation_chars=True)
+            s.whitespace_split = ws
             result = list(s)
-            self.assertEqual(ref, result, "While splitting '%s'" % ss)
+            self.assertEqual(ref, result,
+                             "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitCustom(self):
         """Test handling of syntax splitting with custom chars"""
+        ss = "~/a&&b-c --color=auto||d *.py?"
         ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
-        ss = "~/a && b-c --color=auto || d *.py?"
         s = shlex.shlex(ss, punctuation_chars="|")
         result = list(s)
-        self.assertEqual(ref, result, "While splitting '%s'" % ss)
+        self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
+        ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
+        s = shlex.shlex(ss, punctuation_chars="|")
+        s.whitespace_split = True
+        result = list(s)
+        self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)
 
     def testTokenTypes(self):
         """Test that tokens are split with types as expected."""
@@ -293,6 +306,19 @@
         s = shlex.shlex("'')abc", punctuation_chars=True)
         self.assertEqual(list(s), expected)
 
+    def testUnicodeHandling(self):
+        """Test punctuation_chars and whitespace_split handle unicode."""
+        ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
+        # Should be parsed as one complete token (whitespace_split=True).
+        ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
+        s = shlex.shlex(ss, punctuation_chars=True)
+        s.whitespace_split = True
+        self.assertEqual(list(s), ref)
+        # Without whitespace_split, uses wordchars and splits on all.
+        ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
+        s = shlex.shlex(ss, punctuation_chars=True)
+        self.assertEqual(list(s), ref)
+
     def testQuote(self):
         safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
         unicode_sample = '\xe9\xe0\xdf'  # e + acute accent, a + grave, sharp s
commit	56624a99a916fd27152d5b23364303acc0d707de	[log] [tgz]
author	Evan <evanunderscore@gmail.com>	Sun Jun 02 05:09:22 2019 +1000
committer	Vinay Sajip <vinay_sajip@yahoo.co.uk>	Sat Jun 01 20:09:22 2019 +0100
tree	469ecf27c685101302f1c9c365f394df174e68e9
parent	2b843ac0ae745026ce39514573c5d075137bef65 [diff]