from test import support
from test.support import os_helper
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE)
from io import BytesIO, StringIO
import unittest
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
import os
import token


# Converts a source string into a list of textual representations
# of the tokens, such as:
# `    NAME       'if'          (1, 0) (1, 2)`
# to make writing tests easier.
def stringify_tokens_from_source(token_generator, source_string):
    result = []
    num_lines = len(source_string.splitlines())
    missing_trailing_nl = source_string[-1] not in '\r\n'

    for type, token, start, end, line in token_generator:
        if type == ENDMARKER:
            break
        # Ignore the new line on the last line if the input lacks one
        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
            continue
        type = tok_name[type]
        result.append(f"    {type:10} {token!r:13} {start} {end}")

    return result

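# A minimal usage sketch for stringify_tokens_from_source() (illustrative
# only, not used by the tests): for the source "1 + 1",
#     stringify_tokens_from_source(tokenize(BytesIO(b"1 + 1").readline), "1 + 1")
# returns one aligned row per token,
#     ["    ENCODING   'utf-8'       (0, 0) (0, 0)",
#      "    NUMBER     '1'           (1, 0) (1, 1)",
#      "    OP         '+'           (1, 2) (1, 3)",
#      "    NUMBER     '1'           (1, 4) (1, 5)"]
# with the trailing implicit NEWLINE skipped because the input has no
# trailing newline.
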
class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER, ENCODING and
    # final NEWLINE are omitted for brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER and final NEWLINE are omitted.
        f = BytesIO(s.encode('utf-8'))
        result = stringify_tokens_from_source(tokenize(f.readline), s)

        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

    def test_implicit_newline(self):
        # Make sure that the tokenizer puts in an implicit NEWLINE
        # when the input lacks a trailing new line.
        f = BytesIO("x".encode('utf-8'))
        tokens = list(tokenize(f.readline))
        self.assertEqual(tokens[-2].type, NEWLINE)
        self.assertEqual(tokens[-1].type, ENDMARKER)
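        # Concretely: the stream for b"x" ends with NAME 'x', then a
        # synthesized NEWLINE token whose string is '', then ENDMARKER,
        # even though the input itself contains no "\n".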

    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    \n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NL         '\\n'          (3, 4) (3, 5)
    INDENT     '    '        (4, 0) (4, 4)
    NAME       'True'        (4, 4) (4, 8)
    OP         '='           (4, 9) (4, 10)
    NAME       'False'       (4, 11) (4, 16)
    COMMENT    '# NEWLINE'   (4, 17) (4, 26)
    NEWLINE    '\\n'          (4, 26) (4, 27)
    DEDENT     ''            (5, 0) (5, 0)
    """)
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass
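        # The failing input dedents `  x += 5` to column 2, which matches
        # neither of the open indentation levels (0 and 4), hence the
        # IndentationError.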

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151251616110' (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '15921590215012591' (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
        self.check_tokenize("x = 3.14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 314159.", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = .314159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3e14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3E123", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    """)
        self.check_tokenize("x+y = 3e-1230", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    """)
        self.check_tokenize("x = 3.14e159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)
    """)

    def test_underscore_literals(self):
        def number_token(s):
            f = BytesIO(s.encode('utf-8'))
            for toktype, token, start, end, line in tokenize(f.readline):
                if toktype == NUMBER:
                    return token
            return 'invalid token'
        for lit in VALID_UNDERSCORE_LITERALS:
            if '(' in lit:
                # this won't work with compound complex inputs
                continue
            self.assertEqual(number_token(lit), lit)
        for lit in INVALID_UNDERSCORE_LITERALS:
            self.assertNotEqual(number_token(lit), lit)
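        # Background for the loops above: since PEP 515 (Python 3.6),
        # underscores are allowed in numeric literals, so a valid literal
        # like 1_000 must come back from the tokenizer as one unchanged
        # NUMBER token, while an invalid spelling must not.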

    def test_string(self):
        # String literals
        self.check_tokenize("x = ''; y = \"\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    """)
        self.check_tokenize("x = '\"'; y = \"'\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    """)
        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    """)
        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    """)
        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    """)
        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    """)
        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)
    """)

        self.check_tokenize("u'abc' + U'abc'", """\
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('u"abc" + U"abc"', """\
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('b"abc" + B"abc"', """\
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    """)
        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    """)
        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)
    """)
        # Check 0, 1, and 2 character string prefixes.
        self.check_tokenize(r'"a\
de\
fg"', """\
    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)
        self.check_tokenize(r'u"a\
de"', """\
    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
    """)
        self.check_tokenize(r'rb"a\
d"', """\
    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
    """)
        self.check_tokenize(r'"""a\
b"""', """\
    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)
        self.check_tokenize('f"abc"', """\
    STRING     'f"abc"'      (1, 0) (1, 6)
    """)
        self.check_tokenize('fR"a{b}c"', """\
    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
    """)
        self.check_tokenize('f"""abc"""', """\
    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
    """)
        self.check_tokenize(r'f"abc\
def"', """\
    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)
        self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd23'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ':'           (1, 9) (1, 10)
    NAME       'str'         (1, 11) (1, 14)
    OP         ','           (1, 14) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'int'         (1, 19) (1, 22)
    OP         '='           (1, 22) (1, 23)
    NUMBER     '3'           (1, 23) (1, 24)
    OP         ')'           (1, 24) (1, 25)
    OP         '->'          (1, 26) (1, 28)
    NAME       'int'         (1, 29) (1, 32)
    OP         ':'           (1, 32) (1, 33)
    NAME       'pass'        (1, 34) (1, 38)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod' (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)

        self.check_tokenize("async def foo(): pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    NAME       'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    NAME       'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    NAME       'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    NAME       'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    NAME       'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    NAME       'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

class GenerateTokensTest(TokenizeTest):
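    # Reruns every TokenizeTest case, but through generate_tokens(), which
    # takes str-based readline input and emits no ENCODING token (hence no
    # ENCODING row is prepended to the expected output).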
    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER and final NEWLINE are omitted.
        f = StringIO(s)
        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
        self.assertEqual(result, expected.rstrip().splitlines())


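# A note on the expected output in TestMisc.test_decistmt below: because
# decistmt() hands untokenize() bare 2-tuples, untokenize() works in its
# compatibility mode and chooses its own spacing, which is why the result
# reads "Decimal ('21.3e-5')" with a space before the parenthesis.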
def decistmt(s):
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')

class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

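    # These tests call the private _tokenize() generator directly (the
    # worker wrapped by tokenize()) so that its `encoding` parameter can
    # be exercised explicitly.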
    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end tokens
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end tokens
        tokens = list(_tokenize(readline, encoding=None))[:-2]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline
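
    # get_readline() fabricates the readline callable detect_encoding()
    # expects: each call returns the next bytes line, and StopIteration
    # signals the end of input.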
1082
1083 def test_no_bom_no_encoding_cookie(self):
1084 lines = (
1085 b'# something\n',
1086 b'print(something)\n',
1087 b'do_something(else)\n'
1088 )
1089 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001090 self.assertEqual(encoding, 'utf-8')
1091 self.assertEqual(consumed_lines, list(lines[:2]))
Trent Nelson428de652008-03-18 22:41:35 +00001092
1093 def test_bom_no_cookie(self):
1094 lines = (
1095 b'\xef\xbb\xbf# something\n',
1096 b'print(something)\n',
1097 b'do_something(else)\n'
1098 )
1099 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001100 self.assertEqual(encoding, 'utf-8-sig')
1101 self.assertEqual(consumed_lines,
1102 [b'# something\n', b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +00001103
1104 def test_cookie_first_line_no_bom(self):
1105 lines = (
1106 b'# -*- coding: latin-1 -*-\n',
1107 b'print(something)\n',
1108 b'do_something(else)\n'
1109 )
1110 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001111 self.assertEqual(encoding, 'iso-8859-1')
1112 self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
Trent Nelson428de652008-03-18 22:41:35 +00001113
1114 def test_matched_bom_and_cookie_first_line(self):
1115 lines = (
1116 b'\xef\xbb\xbf# coding=utf-8\n',
1117 b'print(something)\n',
1118 b'do_something(else)\n'
1119 )
1120 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001121 self.assertEqual(encoding, 'utf-8-sig')
1122 self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
Trent Nelson428de652008-03-18 22:41:35 +00001123
1124 def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
1125 lines = (
1126 b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
1127 b'print(something)\n',
1128 b'do_something(else)\n'
1129 )
1130 readline = self.get_readline(lines)
1131 self.assertRaises(SyntaxError, detect_encoding, readline)
1132
1133 def test_cookie_second_line_no_bom(self):
1134 lines = (
1135 b'#! something\n',
1136 b'# vim: set fileencoding=ascii :\n',
1137 b'print(something)\n',
1138 b'do_something(else)\n'
1139 )
1140 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001141 self.assertEqual(encoding, 'ascii')
Trent Nelson428de652008-03-18 22:41:35 +00001142 expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
Ezio Melottib3aedd42010-11-20 19:04:17 +00001143 self.assertEqual(consumed_lines, expected)
Trent Nelson428de652008-03-18 22:41:35 +00001144
1145 def test_matched_bom_and_cookie_second_line(self):
1146 lines = (
1147 b'\xef\xbb\xbf#! something\n',
1148 b'f# coding=utf-8\n',
1149 b'print(something)\n',
1150 b'do_something(else)\n'
1151 )
1152 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001153 self.assertEqual(encoding, 'utf-8-sig')
1154 self.assertEqual(consumed_lines,
1155 [b'#! something\n', b'f# coding=utf-8\n'])
Trent Nelson428de652008-03-18 22:41:35 +00001156
1157 def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
1158 lines = (
1159 b'\xef\xbb\xbf#! something\n',
1160 b'# vim: set fileencoding=ascii :\n',
1161 b'print(something)\n',
1162 b'do_something(else)\n'
1163 )
1164 readline = self.get_readline(lines)
1165 self.assertRaises(SyntaxError, detect_encoding, readline)
1166
Serhiy Storchaka768c16c2014-01-09 18:36:09 +02001167 def test_cookie_second_line_noncommented_first_line(self):
1168 lines = (
1169 b"print('\xc2\xa3')\n",
1170 b'# vim: set fileencoding=iso8859-15 :\n',
1171 b"print('\xe2\x82\xac')\n"
1172 )
1173 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1174 self.assertEqual(encoding, 'utf-8')
1175 expected = [b"print('\xc2\xa3')\n"]
1176 self.assertEqual(consumed_lines, expected)
1177
1178 def test_cookie_second_line_commented_first_line(self):
1179 lines = (
1180 b"#print('\xc2\xa3')\n",
1181 b'# vim: set fileencoding=iso8859-15 :\n',
1182 b"print('\xe2\x82\xac')\n"
1183 )
1184 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1185 self.assertEqual(encoding, 'iso8859-15')
1186 expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1187 self.assertEqual(consumed_lines, expected)
1188
1189 def test_cookie_second_line_empty_first_line(self):
1190 lines = (
1191 b'\n',
1192 b'# vim: set fileencoding=iso8859-15 :\n',
1193 b"print('\xe2\x82\xac')\n"
1194 )
1195 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1196 self.assertEqual(encoding, 'iso8859-15')
1197 expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1198 self.assertEqual(consumed_lines, expected)
1199
Benjamin Petersond3afada2009-10-09 21:43:09 +00001200 def test_latin1_normalization(self):
1201 # See get_normal_name() in tokenizer.c.
1202 encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
1203 "iso-8859-1-unix", "iso-latin-1-mac")
1204 for encoding in encodings:
1205 for rep in ("-", "_"):
1206 enc = encoding.replace("-", rep)
1207 lines = (b"#!/usr/bin/python\n",
1208 b"# coding: " + enc.encode("ascii") + b"\n",
1209 b"print(things)\n",
1210 b"do_something += 4\n")
1211 rl = self.get_readline(lines)
1212 found, consumed_lines = detect_encoding(rl)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001213 self.assertEqual(found, "iso-8859-1")
Benjamin Petersond3afada2009-10-09 21:43:09 +00001214
Martin v. Löwis63674f42012-04-20 14:36:47 +02001215 def test_syntaxerror_latin1(self):
1216 # Issue 14629: need to raise SyntaxError if the first
1217 # line(s) have non-UTF-8 characters
1218 lines = (
1219 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1220 )
1221 readline = self.get_readline(lines)
1222 self.assertRaises(SyntaxError, detect_encoding, readline)
1223
1224
Benjamin Petersond3afada2009-10-09 21:43:09 +00001225 def test_utf8_normalization(self):
1226 # See get_normal_name() in tokenizer.c.
1227 encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
1228 for encoding in encodings:
1229 for rep in ("-", "_"):
1230 enc = encoding.replace("-", rep)
1231 lines = (b"#!/usr/bin/python\n",
1232 b"# coding: " + enc.encode("ascii") + b"\n",
1233 b"1 + 3\n")
1234 rl = self.get_readline(lines)
1235 found, consumed_lines = detect_encoding(rl)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001236 self.assertEqual(found, "utf-8")
Benjamin Petersond3afada2009-10-09 21:43:09 +00001237
Trent Nelson428de652008-03-18 22:41:35 +00001238 def test_short_files(self):
1239 readline = self.get_readline((b'print(something)\n',))
1240 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001241 self.assertEqual(encoding, 'utf-8')
1242 self.assertEqual(consumed_lines, [b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +00001243
1244 encoding, consumed_lines = detect_encoding(self.get_readline(()))
Ezio Melottib3aedd42010-11-20 19:04:17 +00001245 self.assertEqual(encoding, 'utf-8')
1246 self.assertEqual(consumed_lines, [])
Trent Nelson428de652008-03-18 22:41:35 +00001247
1248 readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
1249 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001250 self.assertEqual(encoding, 'utf-8-sig')
1251 self.assertEqual(consumed_lines, [b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +00001252
1253 readline = self.get_readline((b'\xef\xbb\xbf',))
1254 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001255 self.assertEqual(encoding, 'utf-8-sig')
1256 self.assertEqual(consumed_lines, [])
Trent Nelson428de652008-03-18 22:41:35 +00001257
Benjamin Peterson433f32c2008-12-12 01:25:05 +00001258 readline = self.get_readline((b'# coding: bad\n',))
1259 self.assertRaises(SyntaxError, detect_encoding, readline)
Trent Nelson428de652008-03-18 22:41:35 +00001260
Serhiy Storchakadafea852013-09-16 23:51:56 +03001261 def test_false_encoding(self):
1262 # Issue 18873: "Encoding" detected in non-comment lines
1263 readline = self.get_readline((b'print("#coding=fake")',))
1264 encoding, consumed_lines = detect_encoding(readline)
1265 self.assertEqual(encoding, 'utf-8')
1266 self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
1267
Victor Stinner58c07522010-11-09 01:08:59 +00001268 def test_open(self):
Hai Shi46605972020-08-04 00:49:18 +08001269 filename = os_helper.TESTFN + '.py'
1270 self.addCleanup(os_helper.unlink, filename)
Victor Stinner58c07522010-11-09 01:08:59 +00001271
1272 # test coding cookie
1273 for encoding in ('iso-8859-15', 'utf-8'):
1274 with open(filename, 'w', encoding=encoding) as fp:
1275 print("# coding: %s" % encoding, file=fp)
1276 print("print('euro:\u20ac')", file=fp)
1277 with tokenize_open(filename) as fp:
Victor Stinner92665ab2010-11-09 01:11:31 +00001278 self.assertEqual(fp.encoding, encoding)
1279 self.assertEqual(fp.mode, 'r')
Victor Stinner58c07522010-11-09 01:08:59 +00001280
1281 # test BOM (no coding cookie)
1282 with open(filename, 'w', encoding='utf-8-sig') as fp:
1283 print("print('euro:\u20ac')", file=fp)
1284 with tokenize_open(filename) as fp:
Victor Stinner92665ab2010-11-09 01:11:31 +00001285 self.assertEqual(fp.encoding, 'utf-8-sig')
1286 self.assertEqual(fp.mode, 'r')
Victor Stinner58c07522010-11-09 01:08:59 +00001287
Brett Cannonc33f3f22012-04-20 13:23:54 -04001288 def test_filename_in_exception(self):
1289 # When possible, include the file name in the exception.
1290 path = 'some_file_path'
1291 lines = (
1292 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1293 )
1294 class Bunk:
1295 def __init__(self, lines, path):
1296 self.name = path
1297 self._lines = lines
1298 self._index = 0
1299
1300 def readline(self):
1301 if self._index == len(lines):
1302 raise StopIteration
1303 line = lines[self._index]
1304 self._index += 1
1305 return line
1306
1307 with self.assertRaises(SyntaxError):
1308 ins = Bunk(lines, path)
1309 # Make sure lacking a name isn't an issue.
1310 del ins.name
1311 detect_encoding(ins.readline)
1312 with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
1313 ins = Bunk(lines, path)
1314 detect_encoding(ins.readline)
1315
    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

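    # The mocks above isolate tokenize()'s two-stage pipeline: it first calls
    # detect_encoding() (which may consume up to two lines), then hands the
    # consumed lines plus the rest of the stream to the internal _tokenize()
    # generator. A hedged, doctest-style sketch of the unmocked behaviour,
    # where the first yielded token reports the detected encoding:
    #
    #     >>> g = tokenize(BytesIO(b'x = 1\n').readline)
    #     >>> next(g).type == ENCODING
    #     True
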
    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consecutive, one-line defs are OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
                                                # [-2] is always NEWLINE

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 3 + num_optypes)
        self.assertEqual(tok_name[tokens[0].exact_type],
                         tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(tok_name[tokens[i + 1].exact_type],
                             tok_name[optypes[i]])
        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
                         tok_name[token.NEWLINE])
        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                         tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual(':=', token.COLONEQUAL)
        self.assertExactTypeEqual('...', token.ELLIPSIS)
        self.assertExactTypeEqual('->', token.RARROW)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

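    # A hedged, doctest-style sketch of what exact_type adds over type:
    # operator tokens all share the generic type OP, while exact_type
    # distinguishes the individual operators:
    #
    #     >>> tok = list(tokenize(BytesIO(b'+\n').readline))[1]
    #     >>> tok.type == OP, tok.exact_type == token.PLUS
    #     (True, True)
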
    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if the start row precedes the previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                'start (1,3) precedes previous end (2,2)')
        # raise if the start column precedes the previous column in the same row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n  b\n c\n  \\\n  c\n')

    def test_iter_compat(self):
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')

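    # Hedged note on the trailing space asserted above: bare (type, string)
    # 2-tuples carry no position info, so Untokenizer falls back to its
    # compat mode and re-spaces tokens itself, yielding 'Hello ' rather than
    # reconstructing any original layout.
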

class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and each result is tokenized again to
        2-tuples. The test fails if any of the three 2-tuple
        tokenizations differ.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces. A proper test should test this.
        """
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

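    # A minimal, hedged sketch of the invariant checked above: with full
    # 5-tuples, untokenize() reproduces the exact source bytes for
    # well-formed input:
    #
    #     >>> src = b'1 + 2\n'
    #     >>> untokenize(tokenize(BytesIO(src).readline)) == src
    #     True
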
    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also\n")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves trailing
        # whitespace after the colon.

        self.check_roundtrip("if x == 1 : \n"
                             "    print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import' # comment2\n)"
                             "else: print('Loaded')\n")

    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
        self.check_roundtrip("'' ''")

    def test_random_files(self):
        # Test roundtrip on random python modules.
        # Pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))

        # Tokenize is broken on test_unicode_identifiers.py because regular
        # expressions are broken on the obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            if support.verbose >= 2:
                print('tokenize', testfile)
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)


    def roundtrip(self, code):
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)


if __name__ == "__main__":
    unittest.main()