from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                     open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
import os
import token


class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER is omitted for
    # brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
        result = []
        f = BytesIO(s.encode('utf-8'))
        for type, token, start, end, line in tokenize(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append(f"    {type:10} {token!r:13} {start} {end}")
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    \n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NL         '\\n'          (3, 4) (3, 5)
    INDENT     '    '        (4, 0) (4, 4)
    NAME       'True'        (4, 4) (4, 8)
    OP         '='           (4, 9) (4, 10)
    NAME       'False'       (4, 11) (4, 16)
    COMMENT    '# NEWLINE'   (4, 17) (4, 26)
    NEWLINE    '\\n'          (4, 26) (4, 27)
    DEDENT     ''            (5, 0) (5, 0)
    """)
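        # The last line below dedents to a column that matches no
        # enclosing indentation level, so tokenize must raise.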
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151251616110' (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '15921590215012591' (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
        self.check_tokenize("x = 3.14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 314159.", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = .314159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3e14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3E123", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    """)
        self.check_tokenize("x+y = 3e-1230", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    """)
        self.check_tokenize("x = 3.14e159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)
    """)

    def test_underscore_literals(self):
        def number_token(s):
            f = BytesIO(s.encode('utf-8'))
            for toktype, token, start, end, line in tokenize(f.readline):
                if toktype == NUMBER:
                    return token
            return 'invalid token'
        for lit in VALID_UNDERSCORE_LITERALS:
            if '(' in lit:
                # this won't work with compound complex inputs
                continue
            self.assertEqual(number_token(lit), lit)
        for lit in INVALID_UNDERSCORE_LITERALS:
            self.assertNotEqual(number_token(lit), lit)

    def test_string(self):
        # String literals
        self.check_tokenize("x = ''; y = \"\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    """)
        self.check_tokenize("x = '\"'; y = \"'\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    """)
        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    """)
        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    """)
        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    """)
        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    """)
        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)
    """)

        self.check_tokenize("u'abc' + U'abc'", """\
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('u"abc" + U"abc"', """\
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('b"abc" + B"abc"', """\
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    """)
        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    """)
        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)
    """)
        # Check 0, 1, and 2 character string prefixes.
        self.check_tokenize(r'"a\
de\
fg"', """\
    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)
        self.check_tokenize(r'u"a\
de"', """\
    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
    """)
        self.check_tokenize(r'rb"a\
d"', """\
    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
    """)
        self.check_tokenize(r'"""a\
b"""', """\
    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)
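        # f-string prefixes, including the raw variants.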
        self.check_tokenize('f"abc"', """\
    STRING     'f"abc"'      (1, 0) (1, 6)
    """)
        self.check_tokenize('fR"a{b}c"', """\
    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
    """)
        self.check_tokenize('f"""abc"""', """\
    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
    """)
        self.check_tokenize(r'f"abc\
def"', """\
    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)
        self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd23'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ':'           (1, 9) (1, 10)
    NAME       'str'         (1, 11) (1, 14)
    OP         ','           (1, 14) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'int'         (1, 19) (1, 22)
    OP         '='           (1, 22) (1, 23)
    NUMBER     '3'           (1, 23) (1, 24)
    OP         ')'           (1, 24) (1, 25)
    OP         '->'          (1, 26) (1, 28)
    NAME       'int'         (1, 29) (1, 32)
    OP         ':'           (1, 32) (1, 33)
    NAME       'pass'        (1, 34) (1, 38)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod' (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)

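        # Inside an "async def" block the tokenizer emits dedicated ASYNC
        # and AWAIT tokens instead of plain NAMEs, as the tables below show.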
        self.check_tokenize("async def foo(): pass", """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    AWAIT      'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    AWAIT      'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    ASYNC      'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    AWAIT      'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    AWAIT      'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)


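# Example from the tokenize documentation: rewrite float literals as calls
# to Decimal by editing the token stream (exercised by TestMisc.test_decistmt).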
def decistmt(s):
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')

class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

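    # _tokenize is the internal generator behind tokenize(); unlike the
    # public function it takes the encoding directly, which these tests use.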
    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

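    # Helper: serve the given byte lines one readline() call at a time,
    # mimicking the readline method of a binary file object.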
    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
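        # A minimal file-like object: just a name attribute and readline().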
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
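        # Stub out detect_encoding() and _tokenize() so we can check that
        # tokenize() replays the consumed lines and forwards the encoding.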
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertTrue(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consequent, one-line defs is OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK')  # [-1] is always ENDMARKER

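    # Tokenize opstr and verify the exact_type of every token between the
    # leading ENCODING token and the trailing ENDMARKER.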
Meador Inge00c7f852012-01-19 00:44:45 -06001342 def assertExactTypeEqual(self, opstr, *optypes):
1343 tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1344 num_optypes = len(optypes)
1345 self.assertEqual(len(tokens), 2 + num_optypes)
1346 self.assertEqual(token.tok_name[tokens[0].exact_type],
1347 token.tok_name[ENCODING])
1348 for i in range(num_optypes):
1349 self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
1350 token.tok_name[optypes[i]])
1351 self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
1352 token.tok_name[token.ENDMARKER])
1353

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('...', token.ELLIPSIS)
        self.assertExactTypeEqual('->', token.RARROW)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)
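
    # A small sketch (illustrative addition): all of the operators above
    # share the generic type OP; exact_type is the field that tells them
    # apart.
    def test_exact_type_vs_type_sketch(self):
        toks = list(tokenize(BytesIO(b'1+2\n').readline))
        plus = toks[2]
        self.assertEqual(plus.type, OP)
        self.assertEqual(plus.string, '+')
        self.assertEqual(plus.exact_type, token.PLUS)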

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if the start row precedes the previous end row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                'start (1,3) precedes previous end (2,2)')
        # raise if the start column precedes the previous end column on the
        # same row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))
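
    # A sketch of the non-error path (illustrative addition): when the
    # requested start is ahead of the previous end on the same row,
    # add_whitespace() pads with spaces.
    def test_add_whitespace_pads_sketch(self):
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.add_whitespace((1, 4))
        self.assertEqual(u.tokens, ['   '])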

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n')

    def test_iter_compat(self):
        u = Untokenizer()
        tok = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), tok]
        u.compat(tok, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([tok])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')
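
    # A sketch contrasting the two input modes (illustrative addition):
    # 5-tuples carry positions, so untokenize() can restore the original
    # spacing exactly; bare 2-tuples go through compat() and only promise
    # a token-level roundtrip.
    def test_two_tuple_vs_five_tuple_sketch(self):
        source = b'x  =  1\n'
        tokens5 = list(tokenize(BytesIO(source).readline))
        self.assertEqual(untokenize(tokens5), source)
        tokens2 = [t[:2] for t in tokens5]
        regen = untokenize(tokens2)
        self.assertEqual([t[:2] for t in tokenize(BytesIO(regen).readline)],
                         tokens2)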


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
        The test fails if any of the three 2-tuple tokenizations differ.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces.  A proper test should verify this.
        """
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier.  Note that this test involves trailing
        # whitespace after the colon; the trailing blank is significant.

        self.check_roundtrip("if x == 1 : \n"
                             "  print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import') # comment2\n"
                             "else: print('Loaded')\n")

    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
        self.check_roundtrip("'' ''")

    def test_random_files(self):
        # Test roundtrip on random python modules.
        # pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

        # Tokenize is broken on test_pep3131.py because regular expressions
        # are broken on the obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)

    def roundtrip(self, code):
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)
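
    # A sketch of the same property at the compile level (illustrative
    # addition): whatever happens to the whitespace, the roundtripped
    # source must still be valid Python with the same block structure.
    def test_roundtrip_output_compiles_sketch(self):
        code = "if False:\n\tx=3\n\tx=3\n"
        compile(self.roundtrip(code), '<roundtrip>', 'exec')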


if __name__ == "__main__":
    import unittest
    unittest.main()