from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
import os
import token


class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER is omitted for
    # brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
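        # Each line of 'expected' follows the row format built below:
        #     "    {type:10} {token!r:13} {start} {end}"
        # e.g. "    NAME       'if'          (1, 0) (1, 2)".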
        result = []
        f = BytesIO(s.encode('utf-8'))
        for type, token, start, end, line in tokenize(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append(f"    {type:10} {token!r:13} {start} {end}")
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)
    """)
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151251616110' (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '15921590215012591' (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
        self.check_tokenize("x = 3.14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 314159.", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = .314159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3e14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3E123", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    """)
        self.check_tokenize("x+y = 3e-1230", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    """)
        self.check_tokenize("x = 3.14e159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)
    """)

    def test_underscore_literals(self):
        def number_token(s):
            f = BytesIO(s.encode('utf-8'))
            for toktype, token, start, end, line in tokenize(f.readline):
                if toktype == NUMBER:
                    return token
            return 'invalid token'
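        # Per PEP 515, single underscores may separate digits ('1_000');
        # forms like '1__000' or '1_' must not tokenize as one NUMBER.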
        for lit in VALID_UNDERSCORE_LITERALS:
            if '(' in lit:
                # this won't work with compound complex inputs
                continue
            self.assertEqual(number_token(lit), lit)
        for lit in INVALID_UNDERSCORE_LITERALS:
            self.assertNotEqual(number_token(lit), lit)

    def test_string(self):
        # String literals
        self.check_tokenize("x = ''; y = \"\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    """)
        self.check_tokenize("x = '\"'; y = \"'\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    """)
        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    """)
        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    """)
        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    """)
        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    """)
        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)
    """)

        self.check_tokenize("u'abc' + U'abc'", """\
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('u"abc" + U"abc"', """\
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('b"abc" + B"abc"', """\
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    """)
        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    """)
        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)
    """)
        # Check 0, 1, and 2 character string prefixes.
        self.check_tokenize(r'"a\
de\
fg"', """\
    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)
        self.check_tokenize(r'u"a\
de"', """\
    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
    """)
        self.check_tokenize(r'rb"a\
d"', """\
    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
    """)
        self.check_tokenize(r'"""a\
b"""', """\
    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)
        self.check_tokenize('f"abc"', """\
    STRING     'f"abc"'      (1, 0) (1, 6)
    """)
        self.check_tokenize('fR"a{b}c"', """\
    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
    """)
        self.check_tokenize('f"""abc"""', """\
    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
    """)
        self.check_tokenize(r'f"abc\
def"', """\
    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod' (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)
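
        # Everything above tokenizes 'async'/'await' as plain NAMEs. In the
        # Python versions these tests target they are contextual keywords:
        # dedicated ASYNC/AWAIT tokens appear only in and around an
        # 'async def' block, as the checks below expect.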
        self.check_tokenize("async def foo(): pass", """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    AWAIT      'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    AWAIT      'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    ASYNC      'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    AWAIT      'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    AWAIT      'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)


def decistmt(s):
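    """Substitute Decimals for floats in a string of statements.

    This is the tokenize()/untokenize() round-trip example from the docs;
    TestMisc.test_decistmt below checks its expected output.
    """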
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
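        # readline() feeds _tokenize one encoded line, then b'' to signal EOF.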
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
Ezio Melottib3aedd42010-11-20 19:04:17 +0000993 self.assertEqual(tokens, expected_tokens,
994 "bytes not decoded with encoding")
Trent Nelson428de652008-03-18 22:41:35 +0000995
996 def test__tokenize_does_not_decode_with_encoding_none(self):
997 literal = '"ЉЊЈЁЂ"'
998 first = False
999 def readline():
1000 nonlocal first
1001 if not first:
1002 first = True
1003 return literal
1004 else:
1005 return b''
1006
1007 # skip the end token
1008 tokens = list(_tokenize(readline, encoding=None))[:-1]
1009 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melottib3aedd42010-11-20 19:04:17 +00001010 self.assertEqual(tokens, expected_tokens,
1011 "string not tokenized when encoding is None")
Trent Nelson428de652008-03-18 22:41:35 +00001012
1013
1014class TestDetectEncoding(TestCase):
1015
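
    # get_readline() builds the readline() callable that detect_encoding()
    # expects: each call returns the next line from 'lines', and
    # StopIteration signals the end of input.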
    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consecutive one-line defs are OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK')  # [-1] is always ENDMARKER
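
    # assertExactTypeEqual() tokenizes opstr and compares the exact_type of
    # every token between the leading ENCODING and the trailing ENDMARKER
    # against optypes.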
    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if the start position precedes the previous end row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1, 3))
        self.assertEqual(cm.exception.args[0],
                         'start (1,3) precedes previous end (2,2)')
        # raise if the start column precedes the previous end column
        # within the same row
        self.assertRaises(ValueError, u.add_whitespace, (2, 1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')

    def test_iter_compat(self):
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')
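
    def test_untokenize_exact_roundtrip_sketch(self):
        # Illustrative sketch, not part of the original suite: fed the
        # full 5-tuples straight from tokenize(), untokenize() uses the
        # recorded positions and reproduces the source bytes exactly.
        source = b'x = 1 + 2\n'
        self.assertEqual(untokenize(tokenize(BytesIO(source).readline)),
                         source)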


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
        The test fails if the three 2-tuple sequences do not match.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces.  A proper test should test this.
        """
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)
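
    def test_roundtrip_mechanics_sketch(self):
        # Hedged sketch, not from the original suite: a concrete look at
        # the 5- vs 2-tuple distinction the helper above relies on.
        source = 'x = 1\n'
        tokens5 = list(tokenize(BytesIO(source.encode('utf-8')).readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # 5-tuples carry (type, string, start, end, line); 2-tuples keep
        # only (type, string), the minimum untokenize() accepts.
        self.assertEqual(len(tokens5[1]), 5)
        self.assertEqual(tokens2[1], (NAME, 'x'))
        self.check_roundtrip(source)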

    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves
        # trailing whitespace after the colon.

        self.check_roundtrip("if x == 1 : \n"
                             "  print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import' # comment2\n)"
                             "else: print('Loaded')\n")

    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
        self.check_roundtrip("'' ''")
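
    def test_compat_mode_spacing_sketch(self):
        # Hedged sketch, not from the original suite: with bare 2-tuples
        # untokenize() falls back to "compat" mode, which preserves the
        # token sequence but not exact spacing; the expected bytes assume
        # compat mode's rule of padding NAME and NUMBER tokens with a
        # trailing space.
        tokens2 = [tok[:2] for tok in tokenize(BytesIO(b'x=1\n').readline)]
        self.assertEqual(untokenize(tokens2), b'x =1 \n')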

    def test_random_files(self):
        # Test roundtrip on random Python modules.
        # Pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

        # Tokenize is broken on test_unicode_identifiers.py because regular
        # expressions are broken on the obscure unicode identifiers in it.
        # *sigh* With roundtrip extended to test the 5-tuple mode of
        # untokenize, 7 more testfiles fail.  Remove them also until the
        # failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)


    def roundtrip(self, code):
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)
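
    def test_tab_indent_roundtrip_sketch(self):
        # Hedged sketch, not from the original suite: with full 5-tuples
        # the tab indentation itself survives the roundtrip byte-for-byte,
        # a stronger property than the semantic check above.
        code = b"if False:\n\tx = 3\n"
        self.assertEqual(untokenize(tokenize(BytesIO(code).readline)), code)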


if __name__ == "__main__":
    import unittest
    unittest.main()