from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                     open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
import os
import token


class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens.  The ENDMARKER is omitted for
    # brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
        result = []
        f = BytesIO(s.encode('utf-8'))
        for type, token, start, end, line in tokenize(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append(f"    {type:10} {token!r:13} {start} {end}")
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())
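
    # A worked example of the row format (derived from the f-string above,
    # which pads the token type to 10 columns and the token repr to 13):
    # for the input "1 + 1", the first token renders as the row
    #     NUMBER     '1'           (1, 0) (1, 1)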

    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)
    """)
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151251616110' (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '15921590215012591' (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
        self.check_tokenize("x = 3.14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 314159.", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = .314159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3e14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3E123", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    """)
        self.check_tokenize("x+y = 3e-1230", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    """)
        self.check_tokenize("x = 3.14e159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)
    """)

    def test_underscore_literals(self):
        def number_token(s):
            f = BytesIO(s.encode('utf-8'))
            for toktype, token, start, end, line in tokenize(f.readline):
                if toktype == NUMBER:
                    return token
            return 'invalid token'
        for lit in VALID_UNDERSCORE_LITERALS:
            if '(' in lit:
                # this won't work with compound complex inputs
                continue
            self.assertEqual(number_token(lit), lit)
        for lit in INVALID_UNDERSCORE_LITERALS:
            self.assertNotEqual(number_token(lit), lit)

    def test_string(self):
        # String literals
        self.check_tokenize("x = ''; y = \"\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    """)
        self.check_tokenize("x = '\"'; y = \"'\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    """)
        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    """)
        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    """)
        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    """)
        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    """)
        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)
    """)

        self.check_tokenize("u'abc' + U'abc'", """\
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('u"abc" + U"abc"', """\
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('b"abc" + B"abc"', """\
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    """)
        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    """)
        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)
    """)
        # Check 0, 1, and 2 character string prefixes.
        self.check_tokenize(r'"a\
de\
fg"', """\
    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)
        self.check_tokenize(r'u"a\
de"', """\
    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
    """)
        self.check_tokenize(r'rb"a\
d"', """\
    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
    """)
        self.check_tokenize(r'"""a\
b"""', """\
    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)
        self.check_tokenize('f"abc"', """\
    STRING     'f"abc"'      (1, 0) (1, 6)
    """)
        self.check_tokenize('fR"a{b}c"', """\
    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
    """)
        self.check_tokenize('f"""abc"""', """\
    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
    """)
        self.check_tokenize(r'f"abc\
def"', """\
    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)
        self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd23'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ':'           (1, 9) (1, 10)
    NAME       'str'         (1, 11) (1, 14)
    OP         ','           (1, 14) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'int'         (1, 19) (1, 22)
    OP         '='           (1, 22) (1, 23)
    NUMBER     '3'           (1, 23) (1, 24)
    OP         ')'           (1, 24) (1, 25)
    OP         '->'          (1, 26) (1, 28)
    NAME       'int'         (1, 29) (1, 32)
    OP         ':'           (1, 32) (1, 33)
    NAME       'pass'        (1, 34) (1, 38)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod' (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)

        self.check_tokenize("async def foo(): pass", """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    AWAIT      'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    AWAIT      'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    ASYNC      'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    AWAIT      'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    AWAIT      'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)


def decistmt(s):
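    """Substitute Decimals for floats in a string of statements.

    A minimal sketch of the transformation; the exact input/output pair
    below is the one exercised by TestMisc.test_decistmt:

    >>> decistmt('+21.3e-5*-.1234/81.7')
    "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    (untokenize() in compatibility mode appends a space after NAME and
    NUMBER tokens, hence "Decimal ('...')" rather than "Decimal('...')".)
    """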
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')

class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
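        # In the 5-tuple, 3 is token.STRING and the last field is the
        # physical line the token was found on.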
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
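        # Simulates a file's readline(): returns one line per call and
        # raises StopIteration when exhausted, which detect_encoding()
        # treats as end-of-file.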
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
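        # tokenize._builtin_open is the module's private alias for the
        # built-in open(); patching it makes tokenize_open() read from the
        # in-memory stream above instead of a real file.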
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']
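
        # detect_encoding() returns (encoding, consumed_lines); tokenize()
        # must re-emit those consumed lines ahead of what readline yields,
        # which the assertion on `results` below verifies.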
Trent Nelson428de652008-03-18 22:41:35 +00001295
1296 def mock__tokenize(readline, encoding):
1297 nonlocal encoding_used
1298 encoding_used = encoding
1299 out = []
1300 while True:
1301 next_line = readline()
1302 if next_line:
1303 out.append(next_line)
1304 continue
1305 return out
1306
1307 counter = 0
1308 def mock_readline():
1309 nonlocal counter
1310 counter += 1
1311 if counter == 5:
1312 return b''
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001313 return str(counter).encode()
Trent Nelson428de652008-03-18 22:41:35 +00001314
1315 orig_detect_encoding = tokenize_module.detect_encoding
1316 orig__tokenize = tokenize_module._tokenize
1317 tokenize_module.detect_encoding = mock_detect_encoding
1318 tokenize_module._tokenize = mock__tokenize
1319 try:
1320 results = tokenize(mock_readline)
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001321 self.assertEqual(list(results),
1322 [b'first', b'second', b'1', b'2', b'3', b'4'])
Trent Nelson428de652008-03-18 22:41:35 +00001323 finally:
1324 tokenize_module.detect_encoding = orig_detect_encoding
1325 tokenize_module._tokenize = orig__tokenize
1326
1327 self.assertTrue(encoding_used, encoding)
Raymond Hettinger68c04532005-06-10 11:05:19 +00001328
Yury Selivanov8085b802015-05-18 12:50:52 -04001329 def test_oneline_defs(self):
1330 buf = []
1331 for i in range(500):
1332 buf.append('def i{i}(): return {i}'.format(i=i))
1333 buf.append('OK')
1334 buf = '\n'.join(buf)
1335
1336 # Test that 500 consequent, one-line defs is OK
1337 toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
1338 self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
1339
Meador Inge00c7f852012-01-19 00:44:45 -06001340 def assertExactTypeEqual(self, opstr, *optypes):
1341 tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1342 num_optypes = len(optypes)
1343 self.assertEqual(len(tokens), 2 + num_optypes)
1344 self.assertEqual(token.tok_name[tokens[0].exact_type],
1345 token.tok_name[ENCODING])
1346 for i in range(num_optypes):
1347 self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
1348 token.tok_name[optypes[i]])
1349 self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
1350 token.tok_name[token.ENDMARKER])
1351
1352 def test_exact_type(self):
1353 self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1354 self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1355 self.assertExactTypeEqual(':', token.COLON)
1356 self.assertExactTypeEqual(',', token.COMMA)
1357 self.assertExactTypeEqual(';', token.SEMI)
1358 self.assertExactTypeEqual('+', token.PLUS)
1359 self.assertExactTypeEqual('-', token.MINUS)
1360 self.assertExactTypeEqual('*', token.STAR)
1361 self.assertExactTypeEqual('/', token.SLASH)
1362 self.assertExactTypeEqual('|', token.VBAR)
1363 self.assertExactTypeEqual('&', token.AMPER)
1364 self.assertExactTypeEqual('<', token.LESS)
1365 self.assertExactTypeEqual('>', token.GREATER)
1366 self.assertExactTypeEqual('=', token.EQUAL)
1367 self.assertExactTypeEqual('.', token.DOT)
1368 self.assertExactTypeEqual('%', token.PERCENT)
1369 self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1370 self.assertExactTypeEqual('==', token.EQEQUAL)
1371 self.assertExactTypeEqual('!=', token.NOTEQUAL)
1372 self.assertExactTypeEqual('<=', token.LESSEQUAL)
1373 self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1374 self.assertExactTypeEqual('~', token.TILDE)
1375 self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1376 self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1377 self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1378 self.assertExactTypeEqual('**', token.DOUBLESTAR)
1379 self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1380 self.assertExactTypeEqual('-=', token.MINEQUAL)
1381 self.assertExactTypeEqual('*=', token.STAREQUAL)
1382 self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1383 self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1384 self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1385 self.assertExactTypeEqual('|=', token.VBAREQUAL)
1386 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1388 self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1389 self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1390 self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1391 self.assertExactTypeEqual('//', token.DOUBLESLASH)
1392 self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
Jim Fasarakis-Hilliardd4914e92017-03-14 22:16:15 +02001393 self.assertExactTypeEqual('...', token.ELLIPSIS)
1394 self.assertExactTypeEqual('->', token.RARROW)
Meador Inge00c7f852012-01-19 00:44:45 -06001395 self.assertExactTypeEqual('@', token.AT)
Benjamin Petersond51374e2014-04-09 23:55:56 -04001396 self.assertExactTypeEqual('@=', token.ATEQUAL)
Meador Inge00c7f852012-01-19 00:44:45 -06001397
1398 self.assertExactTypeEqual('a**2+b**2==c**2',
1399 NAME, token.DOUBLESTAR, NUMBER,
1400 token.PLUS,
1401 NAME, token.DOUBLESTAR, NUMBER,
1402 token.EQEQUAL,
1403 NAME, token.DOUBLESTAR, NUMBER)
1404 self.assertExactTypeEqual('{1, 2, 3}',
1405 token.LBRACE,
1406 token.NUMBER, token.COMMA,
1407 token.NUMBER, token.COMMA,
1408 token.NUMBER,
1409 token.RBRACE)
1410 self.assertExactTypeEqual('^(x & 0x1)',
1411 token.CIRCUMFLEX,
1412 token.LPAR,
1413 token.NAME, token.AMPER, token.NUMBER,
1414 token.RPAR)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001415
Ezio Melottifafa8b72012-11-03 17:46:51 +02001416 def test_pathological_trailing_whitespace(self):
1417 # See http://bugs.python.org/issue16152
1418 self.assertExactTypeEqual('@ ', token.AT)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001419
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001420
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001421class UntokenizeTest(TestCase):
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001422
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001423 def test_bad_input_order(self):
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001424 # raise if the start row precedes the previous end row
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001425 u = Untokenizer()
1426 u.prev_row = 2
1427 u.prev_col = 2
1428 with self.assertRaises(ValueError) as cm:
1429 u.add_whitespace((1,3))
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001430 self.assertEqual(cm.exception.args[0],
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001431 'start (1,3) precedes previous end (2,2)')
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001432 # raise if the start column precedes the previous end column in the same row
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001433 self.assertRaises(ValueError, u.add_whitespace, (2,1))
1434
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001435 def test_backslash_continuation(self):
1436 # The problem is that <whitespace>\<newline> leaves no token
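        # in the stream, so add_whitespace() synthesizes one backslash-newline
        # per skipped row before padding out the column.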
1437 u = Untokenizer()
1438 u.prev_row = 1
1439 u.prev_col = 1
1440 u.tokens = []
1441 u.add_whitespace((2, 0))
1442 self.assertEqual(u.tokens, ['\\\n'])
1443 u.prev_row = 2
1444 u.add_whitespace((4, 4))
1445 self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' '])
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001446 TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n')
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001447
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -05001448 def test_iter_compat(self):
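        # compat() is the path untokenize() takes when it is fed bare
        # 2-tuples without position information.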
1449 u = Untokenizer()
1450 token = (NAME, 'Hello')
1451 tokens = [(ENCODING, 'utf-8'), token]
1452 u.compat(token, iter([]))
1453 self.assertEqual(u.tokens, ["Hello "])
1454 u = Untokenizer()
1455 self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1456 u = Untokenizer()
1457 self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1458 self.assertEqual(u.encoding, 'utf-8')
1459 self.assertEqual(untokenize(iter(tokens)), b'Hello ')
1460
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001461
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001462class TestRoundtrip(TestCase):
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001463
1464 def check_roundtrip(self, f):
1465 """
1466 Test roundtrip for `untokenize`. `f` is an open file or a string.
1467 The source code in f is tokenized to both 5- and 2-tuples.
1468 Both sequences are converted back to source code via
1469 tokenize.untokenize(), and each result is tokenized again to 2-tuples.
1470 The test fails if the three 2-tuple sequences do not all match.
1471
1472 When untokenize bugs are fixed, untokenize with 5-tuples should
1473 reproduce code that does not contain a backslash continuation
1474 following spaces; a dedicated test should verify this.
1475 """
1476 # Get source code and original tokenizations
1477 if isinstance(f, str):
1478 code = f.encode('utf-8')
1479 else:
1480 code = f.read()
1481 f.close()
1482 readline = iter(code.splitlines(keepends=True)).__next__
1483 tokens5 = list(tokenize(readline))
1484 tokens2 = [tok[:2] for tok in tokens5]
1485 # Reproduce tokens2 from pairs
1486 bytes_from2 = untokenize(tokens2)
1487 readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
1488 tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
1489 self.assertEqual(tokens2_from2, tokens2)
1490 # Reproduce tokens2 from 5-tuples
1491 bytes_from5 = untokenize(tokens5)
1492 readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
1493 tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
1494 self.assertEqual(tokens2_from5, tokens2)
1495
1496 def test_roundtrip(self):
1497 # There are some standard formatting practices that are easy to get right.
1498
1499 self.check_roundtrip("if x == 1:\n"
1500 " print(x)\n")
1501 self.check_roundtrip("# This is a comment\n"
1502 "# This also")
1503
1504 # Some people use different formatting conventions, which makes
1505 # untokenize a little trickier. Note that this test involves trailing
1506 # whitespace after the colon.
1508
1509 self.check_roundtrip("if x == 1 : \n"
1510 " print(x)\n")
1511 fn = support.findfile("tokenize_tests.txt")
1512 with open(fn, 'rb') as f:
1513 self.check_roundtrip(f)
1514 self.check_roundtrip("if x == 1:\n"
1515 " # A comment by itself.\n"
1516 " print(x) # Comment here, too.\n"
1517 " # Another comment.\n"
1518 "after_if = True\n")
1519 self.check_roundtrip("if (x # The comments need to go in the right place\n"
1520 " == 1):\n"
1521 " print('x==1')\n")
1522 self.check_roundtrip("class Test: # A comment here\n"
1523 " # A comment with weird indent\n"
1524 " after_com = 5\n"
1525 " def x(m): return m*5 # a one liner\n"
1526 " def y(m): # A whitespace after the colon\n"
1527 " return y*4 # 3-space indent\n")
1528
1529 # Some error-handling code
1530 self.check_roundtrip("try: import somemodule\n"
1531 "except ImportError: # comment\n"
1532 " print('Can not import' # comment2\n)"
1533 "else: print('Loaded')\n")
1534
1535 def test_continuation(self):
1536 # Implicit continuation inside balanced brackets
1537 self.check_roundtrip("a = (3,4, \n"
1538 "5,6)\n"
1539 "y = [3, 4,\n"
1540 "5]\n"
1541 "z = {'a': 5,\n"
1542 "'b':15, 'c':True}\n"
1543 "x = len(y) + 5 - a[\n"
1544 "3] - a[2]\n"
1545 "+ len(z) - z[\n"
1546 "'b']\n")
1547
1548 def test_backslash_continuation(self):
1549 # Backslash means line continuation, except for comments
1550 self.check_roundtrip("x=1+\\\n"
1551 "1\n"
1552 "# This is a comment\\\n"
1553 "# This also\n")
1554 self.check_roundtrip("# Comment \\\n"
1555 "x = 0")
1556
1557 def test_string_concatenation(self):
1558 # Two string literals on the same line
1559 self.check_roundtrip("'' ''")
1560
1561 def test_random_files(self):
1562 # Test roundtrip on random Python modules.
1563 # Pass the '-ucpu' option to process the full directory.
1564
1565 import glob, random
1566 fn = support.findfile("tokenize_tests.txt")
1567 tempdir = os.path.dirname(fn) or os.curdir
1568 testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
1569
Brett Cannona721aba2016-09-09 14:57:09 -07001570 # Tokenize is broken on test_pep3131.py because regular expressions are
1571 # broken on the obscure unicode identifiers in it. *sigh*
1572 # With roundtrip extended to test the 5-tuple mode of untokenize,
1573 # 7 more testfiles fail. Remove them also until the failure is diagnosed.
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001574
Zachary Ware724f6a62016-09-09 12:55:37 -07001575 testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001576 for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
1577 testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))
1578
1579 if not support.is_resource_enabled("cpu"):
1580 testfiles = random.sample(testfiles, 10)
1581
1582 for testfile in testfiles:
1583 with open(testfile, 'rb') as f:
1584 with self.subTest(file=testfile):
1585 self.check_roundtrip(f)
1586
1587
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001588 def roundtrip(self, code):
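        # Tokenize code and untokenize the full 5-tuples straight back to
        # text (check_roundtrip() additionally exercises the 2-tuple path).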
1589 if isinstance(code, str):
1590 code = code.encode('utf-8')
Jason R. Coombsb6d1cdd2015-06-25 22:42:24 -04001591 return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001592
1593 def test_indentation_semantics_retained(self):
1594 """
1595 Ensure that although whitespace might be mutated in a roundtrip,
1596 the semantic meaning of the indentation remains consistent.
1597 """
1598 code = "if False:\n\tx=3\n\tx=3\n"
Jason R. Coombsb6d1cdd2015-06-25 22:42:24 -04001599 codelines = self.roundtrip(code).split('\n')
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001600 self.assertEqual(codelines[1], codelines[2])
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001601 self.check_roundtrip(code)
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001602
1603
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001604if __name__ == "__main__":
Brett Cannona721aba2016-09-09 14:57:09 -07001605 import unittest
1606 unittest.main()