from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
import os
import token


class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER is omitted for
    # brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
        result = []
        f = BytesIO(s.encode('utf-8'))
        for type, token, start, end, line in tokenize(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append(f"    {type:10} {token!r:13} {start} {end}")
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)
    """)
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass
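        # Note (added comment): tokenize() returns a generator, so the
        # IndentationError only surfaces once iteration reaches the badly
        # dedented line -- hence the otherwise empty loop above.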

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151251616110' (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '15921590215012591' (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
        self.check_tokenize("x = 3.14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 314159.", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = .314159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3e14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3E123", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    """)
        self.check_tokenize("x+y = 3e-1230", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    """)
        self.check_tokenize("x = 3.14e159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)
    """)

    def test_string(self):
        # String literals
        self.check_tokenize("x = ''; y = \"\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    """)
        self.check_tokenize("x = '\"'; y = \"'\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    """)
        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "'  (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"'  (1, 20) (1, 31)
    """)
        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    """)
        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    """)
        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    """)
        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)
    """)

        self.check_tokenize("u'abc' + U'abc'", """\
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('u"abc" + U"abc"', """\
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('b"abc" + B"abc"', """\
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    """)
        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    """)
        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)
    """)
        # Check 0, 1, and 2 character string prefixes.
        self.check_tokenize(r'"a\
de\
fg"', """\
    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)
        self.check_tokenize(r'u"a\
de"', """\
    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
    """)
        self.check_tokenize(r'rb"a\
d"', """\
    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
    """)
        self.check_tokenize(r'"""a\
b"""', """\
    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)
        self.check_tokenize('f"abc"', """\
    STRING     'f"abc"'      (1, 0) (1, 6)
    """)
        self.check_tokenize('fR"a{b}c"', """\
    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
    """)
        self.check_tokenize('f"""abc"""', """\
    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
    """)
        self.check_tokenize(r'f"abc\
def"', """\
    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod' (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)

        self.check_tokenize("async def foo(): pass", """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    AWAIT      'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    AWAIT      'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    ASYNC      'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    AWAIT      'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    AWAIT      'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)


def decistmt(s):
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
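
# A quick illustration of decistmt (an added sketch, not part of the original
# file): float literals become Decimal(...) calls while every other token
# round-trips unchanged. Because the replacement tuples are 2-tuples,
# untokenize() runs in compat mode, which is where the extra spaces in the
# output come from:
#
#     >>> decistmt('+21.3e-5*-.1234/81.7')
#     "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"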

class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))

class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)
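        # The same conflict can be reproduced in memory without a data file
        # (an illustrative sketch, not part of the original suite); a UTF-8
        # BOM followed by a conflicting cookie makes detect_encoding() raise
        # SyntaxError, as TestDetectEncoding below verifies directly:
        #
        #     readline = BytesIO(b'\xef\xbb\xbf# coding: latin-1\n').readline
        #     detect_encoding(readline)  # raises SyntaxError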

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertTrue(encoding_used, encoding)
    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consecutive, one-line defs are OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK')  # [-1] is always ENDMARKER

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

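    # Added note: .exact_type distinguishes the individual operator tokens
    # (PLUS, MINUS, LPAR, ...) that the plain .type attribute lumps together
    # as OP; the checks below walk that mapping one operator at a time.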
1316 def test_exact_type(self):
1317 self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1318 self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1319 self.assertExactTypeEqual(':', token.COLON)
1320 self.assertExactTypeEqual(',', token.COMMA)
1321 self.assertExactTypeEqual(';', token.SEMI)
1322 self.assertExactTypeEqual('+', token.PLUS)
1323 self.assertExactTypeEqual('-', token.MINUS)
1324 self.assertExactTypeEqual('*', token.STAR)
1325 self.assertExactTypeEqual('/', token.SLASH)
1326 self.assertExactTypeEqual('|', token.VBAR)
1327 self.assertExactTypeEqual('&', token.AMPER)
1328 self.assertExactTypeEqual('<', token.LESS)
1329 self.assertExactTypeEqual('>', token.GREATER)
1330 self.assertExactTypeEqual('=', token.EQUAL)
1331 self.assertExactTypeEqual('.', token.DOT)
1332 self.assertExactTypeEqual('%', token.PERCENT)
1333 self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1334 self.assertExactTypeEqual('==', token.EQEQUAL)
1335 self.assertExactTypeEqual('!=', token.NOTEQUAL)
1336 self.assertExactTypeEqual('<=', token.LESSEQUAL)
1337 self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1338 self.assertExactTypeEqual('~', token.TILDE)
1339 self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1340 self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1341 self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1342 self.assertExactTypeEqual('**', token.DOUBLESTAR)
1343 self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1344 self.assertExactTypeEqual('-=', token.MINEQUAL)
1345 self.assertExactTypeEqual('*=', token.STAREQUAL)
1346 self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1347 self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1348 self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1349 self.assertExactTypeEqual('|=', token.VBAREQUAL)
1350 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1351 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)
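
    # An illustrative sketch (an addition): for operators, .type stays
    # the generic OP while .exact_type carries the specific operator
    # token that assertExactTypeEqual() checks above.
    def test_exact_type_vs_type_sketch(self):
        toks = list(tokenize(BytesIO(b'+').readline))
        plus = toks[1]  # toks[0] is ENCODING, toks[-1] is ENDMARKER
        self.assertEqual(plus.type, OP)
        self.assertEqual(plus.exact_type, token.PLUS)
        self.assertEqual(plus.string, '+')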


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if the start row precedes the previous end row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                         'start (1,3) precedes previous end (2,2)')
        # raise if the start column precedes the previous end column in the same row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # A <whitespace>\<newline> continuation leaves no token behind,
        # so add_whitespace must re-create it from positions alone.
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n')
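
    # An illustrative sketch (an addition): within a single row no
    # continuation is needed, so add_whitespace() pads with spaces only.
    def test_add_whitespace_same_row_sketch(self):
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((1, 4))
        self.assertEqual(u.tokens, ['   '])  # three spaces, no backslash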

    def test_iter_compat(self):
        u = Untokenizer()
        token = (NAME, 'Hello')  # shadows the token module, but only locally
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
        The test fails if any of the three 2-tuple tokenizations differ.

        Once the remaining untokenize bugs are fixed, untokenizing
        5-tuples should reproduce source that contains no backslash
        continuation following spaces; a proper test should then verify
        this.
        """
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)
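
    # An illustrative sketch (an addition): the 5-tuple/2-tuple
    # distinction described in the docstring above, shown on one token.
    def test_token_tuple_shapes_sketch(self):
        toks = list(tokenize(BytesIO(b"x = 1\n").readline))
        name_tok = toks[1]                  # the NAME token for 'x'
        self.assertEqual(len(name_tok), 5)  # type, string, start, end, line
        self.assertEqual(name_tok[:2], (NAME, 'x'))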

    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier.  Note that this source has trailing
        # whitespace after the colon; it must survive the roundtrip.

        self.check_roundtrip("if x == 1 : \n"
                             "    print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code.  Note that the paren closing the
        # print() call lands at the start of the next source line.
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import' # comment2\n"
                             ")else: print('Loaded')\n")

    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
        self.check_roundtrip("'' ''")

    def test_random_files(self):
        # Test roundtrip on random python modules.
        # Pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

        # Tokenize is broken on test_pep3131.py because regular expressions are
        # broken on the obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)

    def roundtrip(self, code):
        # Tokenize `code` and untokenize the full token stream back to text.
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)


if __name__ == "__main__":
    import unittest  # unittest itself is not imported at module level
    unittest.main()