from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
import os
import token


class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER is omitted for
    # brevity.
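    # Each expected row pairs a token type name with the token's repr and
    # its (row, col) start and end positions, mirroring the TokenInfo
    # 5-tuples that tokenize() yields.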

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
        result = []
        f = BytesIO(s.encode('utf-8'))
        for type, token, start, end, line in tokenize(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append("    %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
                          locals())
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
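        # NEWLINE ends a logical line of code, while NL marks a line break
        # that does not end one (here, the comment-only line).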
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)
    """)
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
        self.check_tokenize("x = 3.14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 314159.", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = .314159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3e14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3E123", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    """)
        self.check_tokenize("x+y = 3e-1230", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    """)
        self.check_tokenize("x = 3.14e159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)
    """)

    def test_string(self):
        # String literals
        self.check_tokenize("x = ''; y = \"\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    """)
        self.check_tokenize("x = '\"'; y = \"'\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    """)
        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    """)
        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    """)
        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    """)
        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    """)
        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)
    """)

        self.check_tokenize("u'abc' + U'abc'", """\
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('u"abc" + U"abc"', """\
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('b"abc" + B"abc"', """\
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    """)
        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    """)
        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
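        # In this Python version async/await are not reserved words: they
        # tokenize as plain NAME everywhere except around 'async def',
        # where the dedicated ASYNC and AWAIT token types are produced.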
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)

        self.check_tokenize("async def foo(): pass", """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    AWAIT      'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    AWAIT      'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    ASYNC      'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    AWAIT      'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    AWAIT      'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)


def decistmt(s):
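    """Substitute Decimal('...') constructor calls for float literals.

    A sketch of the example from the tokenize documentation: each NUMBER
    token whose text contains a '.' is replaced by the token sequence
    Decimal ( '<literal>' ), and the result is untokenized back into
    source text.
    """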
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')

class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
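        # Each token is a 5-tuple (type, string, start, end, line);
        # type 3 is token.STRING.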
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
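        # Return a readline() callable that serves the given byte lines
        # one at a time, the way detect_encoding() would read them from
        # a file.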
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertTrue(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consecutive, one-line defs are OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK')  # [-1] is always ENDMARKER

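    # tokenize() labels every operator with the generic OP type; the
    # exact_type attribute of each TokenInfo carries the specific operator
    # token (token.PLUS, token.LPAR, ...), which the helper below checks.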
    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001340class UntokenizeTest(TestCase):
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001341
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001342 def test_bad_input_order(self):
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001343 # raise if previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                         'start (1,3) precedes previous end (2,2)')
        # raise if the start column precedes the previous end column
        # within the same row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
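        # behind, so add_whitespace() compensates by emitting one '\\\n' per
        # skipped row, keeping the regenerated source consistent with the
        # recorded token positions.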
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n    \\\n  c\n')

    def test_iter_compat(self):
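        # compat mode regenerates source from the token strings alone: a NAME
        # token gets a trailing space appended, and when an ENCODING token is
        # present the module-level untokenize() encodes its result to bytes.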
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and both results are tokenized again to
        2-tuples. The test fails if the three 2-tuple sequences do not
        all match.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces; a proper test should then check for that.
        """
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves trailing
        # whitespace after the colon; that whitespace must survive the roundtrip.

        self.check_roundtrip("if x == 1 : \n"
                             "    print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import' # comment2\n)"
                             "else: print('Loaded')\n")

    def test_continuation(self):
        # Implicit line continuation inside balanced brackets
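        # (a line break inside an open (), [] or {} is tokenized as NL rather
        # than NEWLINE, so no backslash is required and untokenize must keep
        # the break where it was)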
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
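        # (presumably the hazard here: losing the separating space would merge
        # the literals into '''', which the tokenizer reads as the start of an
        # unterminated triple-quoted string)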
        self.check_roundtrip("'' ''")

    def test_random_files(self):
        # Test roundtrip on random python modules.
        # Pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

        # Tokenize is broken on test_pep3131.py because regular expressions are
        # broken on the obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail. Remove them also until the failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)

    def roundtrip(self, code):
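        # Helper: push code through tokenize() and untokenize() once and
        # return the regenerated source as a str.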
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)


if __name__ == "__main__":
    import unittest  # the header imports names from unittest, not the module itself
    unittest.main()