from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                     open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
import unittest  # needed by unittest.main() at the bottom of the file
import os
import token


class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER is omitted for
    # brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
        result = []
        f = BytesIO(s.encode('utf-8'))
        for type, token, start, end, line in tokenize(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append("    %(type)-10.10s %(token)-13.13r %(start)s %(end)s" %
                          locals())
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

33 def test_basic(self):
34 self.check_tokenize("1 + 1", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +000035 NUMBER '1' (1, 0) (1, 1)
36 OP '+' (1, 2) (1, 3)
37 NUMBER '1' (1, 4) (1, 5)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030038 """)
39 self.check_tokenize("if False:\n"
40 " # NL\n"
41 " True = False # NEWLINE\n", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +000042 NAME 'if' (1, 0) (1, 2)
43 NAME 'False' (1, 3) (1, 8)
44 OP ':' (1, 8) (1, 9)
45 NEWLINE '\\n' (1, 9) (1, 10)
46 COMMENT '# NL' (2, 4) (2, 8)
47 NL '\\n' (2, 8) (2, 9)
48 INDENT ' ' (3, 0) (3, 4)
49 NAME 'True' (3, 4) (3, 8)
50 OP '=' (3, 9) (3, 10)
51 NAME 'False' (3, 11) (3, 16)
52 COMMENT '# NEWLINE' (3, 17) (3, 26)
53 NEWLINE '\\n' (3, 26) (3, 27)
54 DEDENT '' (4, 0) (4, 0)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030055 """)
56 indent_error_file = b"""\
57def k(x):
58 x += 2
59 x += 5
60"""
61 readline = BytesIO(indent_error_file).readline
62 with self.assertRaisesRegex(IndentationError,
63 "unindent does not match any "
64 "outer indentation level"):
65 for tok in tokenize(readline):
66 pass
Thomas Wouters89f507f2006-12-13 04:49:30 +000067
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030068 def test_int(self):
69 # Ordinary integers and binary operators
70 self.check_tokenize("0xff <= 255", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +000071 NUMBER '0xff' (1, 0) (1, 4)
72 OP '<=' (1, 5) (1, 7)
73 NUMBER '255' (1, 8) (1, 11)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030074 """)
75 self.check_tokenize("0b10 <= 255", """\
Eric Smith74ca5572008-03-17 19:49:19 +000076 NUMBER '0b10' (1, 0) (1, 4)
77 OP '<=' (1, 5) (1, 7)
78 NUMBER '255' (1, 8) (1, 11)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030079 """)
80 self.check_tokenize("0o123 <= 0O123", """\
Eric Smith74ca5572008-03-17 19:49:19 +000081 NUMBER '0o123' (1, 0) (1, 5)
82 OP '<=' (1, 6) (1, 8)
83 NUMBER '0O123' (1, 9) (1, 14)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030084 """)
85 self.check_tokenize("1234567 > ~0x15", """\
Mark Dickinson0c1f7c02008-03-16 05:05:12 +000086 NUMBER '1234567' (1, 0) (1, 7)
87 OP '>' (1, 8) (1, 9)
88 OP '~' (1, 10) (1, 11)
89 NUMBER '0x15' (1, 11) (1, 15)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030090 """)
91 self.check_tokenize("2134568 != 1231515", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +000092 NUMBER '2134568' (1, 0) (1, 7)
93 OP '!=' (1, 8) (1, 10)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +000094 NUMBER '1231515' (1, 11) (1, 18)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +030095 """)
96 self.check_tokenize("(-124561-1) & 200000000", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +000097 OP '(' (1, 0) (1, 1)
98 OP '-' (1, 1) (1, 2)
99 NUMBER '124561' (1, 2) (1, 8)
100 OP '-' (1, 8) (1, 9)
101 NUMBER '1' (1, 9) (1, 10)
102 OP ')' (1, 10) (1, 11)
103 OP '&' (1, 12) (1, 13)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000104 NUMBER '200000000' (1, 14) (1, 23)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300105 """)
106 self.check_tokenize("0xdeadbeef != -1", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000107 NUMBER '0xdeadbeef' (1, 0) (1, 10)
108 OP '!=' (1, 11) (1, 13)
109 OP '-' (1, 14) (1, 15)
110 NUMBER '1' (1, 15) (1, 16)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300111 """)
112 self.check_tokenize("0xdeadc0de & 12345", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000113 NUMBER '0xdeadc0de' (1, 0) (1, 10)
114 OP '&' (1, 11) (1, 12)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000115 NUMBER '12345' (1, 13) (1, 18)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300116 """)
117 self.check_tokenize("0xFF & 0x15 | 1234", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000118 NUMBER '0xFF' (1, 0) (1, 4)
119 OP '&' (1, 5) (1, 6)
120 NUMBER '0x15' (1, 7) (1, 11)
121 OP '|' (1, 12) (1, 13)
122 NUMBER '1234' (1, 14) (1, 18)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300123 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000124
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300125 def test_long(self):
126 # Long integers
127 self.check_tokenize("x = 0", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000128 NAME 'x' (1, 0) (1, 1)
129 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000130 NUMBER '0' (1, 4) (1, 5)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300131 """)
132 self.check_tokenize("x = 0xfffffffffff", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000133 NAME 'x' (1, 0) (1, 1)
134 OP '=' (1, 2) (1, 3)
135 NUMBER '0xffffffffff (1, 4) (1, 17)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300136 """)
137 self.check_tokenize("x = 123141242151251616110", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000138 NAME 'x' (1, 0) (1, 1)
139 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000140 NUMBER '123141242151 (1, 4) (1, 25)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300141 """)
142 self.check_tokenize("x = -15921590215012591", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000143 NAME 'x' (1, 0) (1, 1)
144 OP '=' (1, 2) (1, 3)
145 OP '-' (1, 4) (1, 5)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000146 NUMBER '159215902150 (1, 5) (1, 22)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300147 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000148
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300149 def test_float(self):
150 # Floating point numbers
151 self.check_tokenize("x = 3.14159", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000152 NAME 'x' (1, 0) (1, 1)
153 OP '=' (1, 2) (1, 3)
154 NUMBER '3.14159' (1, 4) (1, 11)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300155 """)
156 self.check_tokenize("x = 314159.", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000157 NAME 'x' (1, 0) (1, 1)
158 OP '=' (1, 2) (1, 3)
159 NUMBER '314159.' (1, 4) (1, 11)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300160 """)
161 self.check_tokenize("x = .314159", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000162 NAME 'x' (1, 0) (1, 1)
163 OP '=' (1, 2) (1, 3)
164 NUMBER '.314159' (1, 4) (1, 11)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300165 """)
166 self.check_tokenize("x = 3e14159", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000167 NAME 'x' (1, 0) (1, 1)
168 OP '=' (1, 2) (1, 3)
169 NUMBER '3e14159' (1, 4) (1, 11)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300170 """)
171 self.check_tokenize("x = 3E123", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000172 NAME 'x' (1, 0) (1, 1)
173 OP '=' (1, 2) (1, 3)
174 NUMBER '3E123' (1, 4) (1, 9)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300175 """)
176 self.check_tokenize("x+y = 3e-1230", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000177 NAME 'x' (1, 0) (1, 1)
178 OP '+' (1, 1) (1, 2)
179 NAME 'y' (1, 2) (1, 3)
180 OP '=' (1, 4) (1, 5)
181 NUMBER '3e-1230' (1, 6) (1, 13)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300182 """)
183 self.check_tokenize("x = 3.14e159", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000184 NAME 'x' (1, 0) (1, 1)
185 OP '=' (1, 2) (1, 3)
186 NUMBER '3.14e159' (1, 4) (1, 12)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300187 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000188
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300189 def test_string(self):
190 # String literals
191 self.check_tokenize("x = ''; y = \"\"", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000192 NAME 'x' (1, 0) (1, 1)
193 OP '=' (1, 2) (1, 3)
194 STRING "''" (1, 4) (1, 6)
195 OP ';' (1, 6) (1, 7)
196 NAME 'y' (1, 8) (1, 9)
197 OP '=' (1, 10) (1, 11)
198 STRING '""' (1, 12) (1, 14)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300199 """)
200 self.check_tokenize("x = '\"'; y = \"'\"", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000201 NAME 'x' (1, 0) (1, 1)
202 OP '=' (1, 2) (1, 3)
203 STRING '\\'"\\'' (1, 4) (1, 7)
204 OP ';' (1, 7) (1, 8)
205 NAME 'y' (1, 9) (1, 10)
206 OP '=' (1, 11) (1, 12)
207 STRING '"\\'"' (1, 13) (1, 16)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300208 """)
209 self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000210 NAME 'x' (1, 0) (1, 1)
211 OP '=' (1, 2) (1, 3)
212 STRING '"doesn\\'t "' (1, 4) (1, 14)
213 NAME 'shrink' (1, 14) (1, 20)
214 STRING '", does it"' (1, 20) (1, 31)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300215 """)
216 self.check_tokenize("x = 'abc' + 'ABC'", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000217 NAME 'x' (1, 0) (1, 1)
218 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000219 STRING "'abc'" (1, 4) (1, 9)
220 OP '+' (1, 10) (1, 11)
221 STRING "'ABC'" (1, 12) (1, 17)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300222 """)
223 self.check_tokenize('y = "ABC" + "ABC"', """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000224 NAME 'y' (1, 0) (1, 1)
225 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000226 STRING '"ABC"' (1, 4) (1, 9)
227 OP '+' (1, 10) (1, 11)
228 STRING '"ABC"' (1, 12) (1, 17)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300229 """)
230 self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000231 NAME 'x' (1, 0) (1, 1)
232 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000233 STRING "r'abc'" (1, 4) (1, 10)
234 OP '+' (1, 11) (1, 12)
235 STRING "r'ABC'" (1, 13) (1, 19)
236 OP '+' (1, 20) (1, 21)
237 STRING "R'ABC'" (1, 22) (1, 28)
238 OP '+' (1, 29) (1, 30)
239 STRING "R'ABC'" (1, 31) (1, 37)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300240 """)
241 self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000242 NAME 'y' (1, 0) (1, 1)
243 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000244 STRING 'r"abc"' (1, 4) (1, 10)
245 OP '+' (1, 11) (1, 12)
246 STRING 'r"ABC"' (1, 13) (1, 19)
247 OP '+' (1, 20) (1, 21)
248 STRING 'R"ABC"' (1, 22) (1, 28)
249 OP '+' (1, 29) (1, 30)
250 STRING 'R"ABC"' (1, 31) (1, 37)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300251 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000252
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300253 self.check_tokenize("u'abc' + U'abc'", """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500254 STRING "u'abc'" (1, 0) (1, 6)
255 OP '+' (1, 7) (1, 8)
256 STRING "U'abc'" (1, 9) (1, 15)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300257 """)
258 self.check_tokenize('u"abc" + U"abc"', """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500259 STRING 'u"abc"' (1, 0) (1, 6)
260 OP '+' (1, 7) (1, 8)
261 STRING 'U"abc"' (1, 9) (1, 15)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300262 """)
Meador Inge8d5c0b82012-06-16 21:49:08 -0500263
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300264 self.check_tokenize("b'abc' + B'abc'", """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500265 STRING "b'abc'" (1, 0) (1, 6)
266 OP '+' (1, 7) (1, 8)
267 STRING "B'abc'" (1, 9) (1, 15)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300268 """)
269 self.check_tokenize('b"abc" + B"abc"', """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500270 STRING 'b"abc"' (1, 0) (1, 6)
271 OP '+' (1, 7) (1, 8)
272 STRING 'B"abc"' (1, 9) (1, 15)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300273 """)
274 self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500275 STRING "br'abc'" (1, 0) (1, 7)
276 OP '+' (1, 8) (1, 9)
277 STRING "bR'abc'" (1, 10) (1, 17)
278 OP '+' (1, 18) (1, 19)
279 STRING "Br'abc'" (1, 20) (1, 27)
280 OP '+' (1, 28) (1, 29)
281 STRING "BR'abc'" (1, 30) (1, 37)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300282 """)
283 self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500284 STRING 'br"abc"' (1, 0) (1, 7)
285 OP '+' (1, 8) (1, 9)
286 STRING 'bR"abc"' (1, 10) (1, 17)
287 OP '+' (1, 18) (1, 19)
288 STRING 'Br"abc"' (1, 20) (1, 27)
289 OP '+' (1, 28) (1, 29)
290 STRING 'BR"abc"' (1, 30) (1, 37)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300291 """)
292 self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500293 STRING "rb'abc'" (1, 0) (1, 7)
294 OP '+' (1, 8) (1, 9)
295 STRING "rB'abc'" (1, 10) (1, 17)
296 OP '+' (1, 18) (1, 19)
297 STRING "Rb'abc'" (1, 20) (1, 27)
298 OP '+' (1, 28) (1, 29)
299 STRING "RB'abc'" (1, 30) (1, 37)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300300 """)
301 self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
Meador Inge8d5c0b82012-06-16 21:49:08 -0500302 STRING 'rb"abc"' (1, 0) (1, 7)
303 OP '+' (1, 8) (1, 9)
304 STRING 'rB"abc"' (1, 10) (1, 17)
305 OP '+' (1, 18) (1, 19)
306 STRING 'Rb"abc"' (1, 20) (1, 27)
307 OP '+' (1, 28) (1, 29)
308 STRING 'RB"abc"' (1, 30) (1, 37)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300309 """)
Meador Inge8d5c0b82012-06-16 21:49:08 -0500310
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300311 def test_function(self):
312 self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000313 NAME 'def' (1, 0) (1, 3)
314 NAME 'd22' (1, 4) (1, 7)
315 OP '(' (1, 7) (1, 8)
316 NAME 'a' (1, 8) (1, 9)
317 OP ',' (1, 9) (1, 10)
318 NAME 'b' (1, 11) (1, 12)
319 OP ',' (1, 12) (1, 13)
320 NAME 'c' (1, 14) (1, 15)
321 OP '=' (1, 15) (1, 16)
322 NUMBER '2' (1, 16) (1, 17)
323 OP ',' (1, 17) (1, 18)
324 NAME 'd' (1, 19) (1, 20)
325 OP '=' (1, 20) (1, 21)
326 NUMBER '2' (1, 21) (1, 22)
327 OP ',' (1, 22) (1, 23)
328 OP '*' (1, 24) (1, 25)
329 NAME 'k' (1, 25) (1, 26)
330 OP ')' (1, 26) (1, 27)
331 OP ':' (1, 27) (1, 28)
332 NAME 'pass' (1, 29) (1, 33)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300333 """)
334 self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000335 NAME 'def' (1, 0) (1, 3)
336 NAME 'd01v_' (1, 4) (1, 9)
337 OP '(' (1, 9) (1, 10)
338 NAME 'a' (1, 10) (1, 11)
339 OP '=' (1, 11) (1, 12)
340 NUMBER '1' (1, 12) (1, 13)
341 OP ',' (1, 13) (1, 14)
342 OP '*' (1, 15) (1, 16)
343 NAME 'k' (1, 16) (1, 17)
344 OP ',' (1, 17) (1, 18)
345 OP '**' (1, 19) (1, 21)
346 NAME 'w' (1, 21) (1, 22)
347 OP ')' (1, 22) (1, 23)
348 OP ':' (1, 23) (1, 24)
349 NAME 'pass' (1, 25) (1, 29)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300350 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000351
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300352 def test_comparison(self):
353 # Comparison
354 self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
355 "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000356 NAME 'if' (1, 0) (1, 2)
357 NUMBER '1' (1, 3) (1, 4)
358 OP '<' (1, 5) (1, 6)
359 NUMBER '1' (1, 7) (1, 8)
360 OP '>' (1, 9) (1, 10)
361 NUMBER '1' (1, 11) (1, 12)
362 OP '==' (1, 13) (1, 15)
363 NUMBER '1' (1, 16) (1, 17)
364 OP '>=' (1, 18) (1, 20)
365 NUMBER '5' (1, 21) (1, 22)
366 OP '<=' (1, 23) (1, 25)
367 NUMBER '0x15' (1, 26) (1, 30)
368 OP '<=' (1, 31) (1, 33)
369 NUMBER '0x12' (1, 34) (1, 38)
370 OP '!=' (1, 39) (1, 41)
371 NUMBER '1' (1, 42) (1, 43)
372 NAME 'and' (1, 44) (1, 47)
373 NUMBER '5' (1, 48) (1, 49)
374 NAME 'in' (1, 50) (1, 52)
375 NUMBER '1' (1, 53) (1, 54)
376 NAME 'not' (1, 55) (1, 58)
377 NAME 'in' (1, 59) (1, 61)
378 NUMBER '1' (1, 62) (1, 63)
379 NAME 'is' (1, 64) (1, 66)
380 NUMBER '1' (1, 67) (1, 68)
381 NAME 'or' (1, 69) (1, 71)
382 NUMBER '5' (1, 72) (1, 73)
383 NAME 'is' (1, 74) (1, 76)
384 NAME 'not' (1, 77) (1, 80)
385 NUMBER '1' (1, 81) (1, 82)
386 OP ':' (1, 82) (1, 83)
387 NAME 'pass' (1, 84) (1, 88)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300388 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000389
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300390 def test_shift(self):
391 # Shift
392 self.check_tokenize("x = 1 << 1 >> 5", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000393 NAME 'x' (1, 0) (1, 1)
394 OP '=' (1, 2) (1, 3)
395 NUMBER '1' (1, 4) (1, 5)
396 OP '<<' (1, 6) (1, 8)
397 NUMBER '1' (1, 9) (1, 10)
398 OP '>>' (1, 11) (1, 13)
399 NUMBER '5' (1, 14) (1, 15)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300400 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000401
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300402 def test_additive(self):
403 # Additive
404 self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000405 NAME 'x' (1, 0) (1, 1)
406 OP '=' (1, 2) (1, 3)
407 NUMBER '1' (1, 4) (1, 5)
408 OP '-' (1, 6) (1, 7)
409 NAME 'y' (1, 8) (1, 9)
410 OP '+' (1, 10) (1, 11)
411 NUMBER '15' (1, 12) (1, 14)
412 OP '-' (1, 15) (1, 16)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000413 NUMBER '1' (1, 17) (1, 18)
414 OP '+' (1, 19) (1, 20)
415 NUMBER '0x124' (1, 21) (1, 26)
416 OP '+' (1, 27) (1, 28)
417 NAME 'z' (1, 29) (1, 30)
418 OP '+' (1, 31) (1, 32)
419 NAME 'a' (1, 33) (1, 34)
420 OP '[' (1, 34) (1, 35)
421 NUMBER '5' (1, 35) (1, 36)
422 OP ']' (1, 36) (1, 37)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300423 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000424
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300425 def test_multiplicative(self):
426 # Multiplicative
427 self.check_tokenize("x = 1//1*1/5*12%0x12", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000428 NAME 'x' (1, 0) (1, 1)
429 OP '=' (1, 2) (1, 3)
430 NUMBER '1' (1, 4) (1, 5)
431 OP '//' (1, 5) (1, 7)
432 NUMBER '1' (1, 7) (1, 8)
433 OP '*' (1, 8) (1, 9)
434 NUMBER '1' (1, 9) (1, 10)
435 OP '/' (1, 10) (1, 11)
436 NUMBER '5' (1, 11) (1, 12)
437 OP '*' (1, 12) (1, 13)
438 NUMBER '12' (1, 13) (1, 15)
439 OP '%' (1, 15) (1, 16)
440 NUMBER '0x12' (1, 16) (1, 20)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300441 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000442
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300443 def test_unary(self):
444 # Unary
445 self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000446 OP '~' (1, 0) (1, 1)
447 NUMBER '1' (1, 1) (1, 2)
448 OP '^' (1, 3) (1, 4)
449 NUMBER '1' (1, 5) (1, 6)
450 OP '&' (1, 7) (1, 8)
451 NUMBER '1' (1, 9) (1, 10)
452 OP '|' (1, 11) (1, 12)
453 NUMBER '1' (1, 12) (1, 13)
454 OP '^' (1, 14) (1, 15)
455 OP '-' (1, 16) (1, 17)
456 NUMBER '1' (1, 17) (1, 18)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300457 """)
458 self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000459 OP '-' (1, 0) (1, 1)
460 NUMBER '1' (1, 1) (1, 2)
461 OP '*' (1, 2) (1, 3)
462 NUMBER '1' (1, 3) (1, 4)
463 OP '/' (1, 4) (1, 5)
464 NUMBER '1' (1, 5) (1, 6)
465 OP '+' (1, 6) (1, 7)
466 NUMBER '1' (1, 7) (1, 8)
467 OP '*' (1, 8) (1, 9)
468 NUMBER '1' (1, 9) (1, 10)
469 OP '//' (1, 10) (1, 12)
470 NUMBER '1' (1, 12) (1, 13)
471 OP '-' (1, 14) (1, 15)
472 OP '-' (1, 16) (1, 17)
473 OP '-' (1, 17) (1, 18)
474 OP '-' (1, 18) (1, 19)
475 NUMBER '1' (1, 19) (1, 20)
476 OP '**' (1, 20) (1, 22)
477 NUMBER '1' (1, 22) (1, 23)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300478 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000479
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300480 def test_selector(self):
481 # Selector
482 self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000483 NAME 'import' (1, 0) (1, 6)
484 NAME 'sys' (1, 7) (1, 10)
485 OP ',' (1, 10) (1, 11)
486 NAME 'time' (1, 12) (1, 16)
487 NEWLINE '\\n' (1, 16) (1, 17)
488 NAME 'x' (2, 0) (2, 1)
489 OP '=' (2, 2) (2, 3)
490 NAME 'sys' (2, 4) (2, 7)
491 OP '.' (2, 7) (2, 8)
492 NAME 'modules' (2, 8) (2, 15)
493 OP '[' (2, 15) (2, 16)
494 STRING "'time'" (2, 16) (2, 22)
495 OP ']' (2, 22) (2, 23)
496 OP '.' (2, 23) (2, 24)
497 NAME 'time' (2, 24) (2, 28)
498 OP '(' (2, 28) (2, 29)
499 OP ')' (2, 29) (2, 30)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300500 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000501
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300502 def test_method(self):
503 # Methods
504 self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000505 OP '@' (1, 0) (1, 1)
506 NAME 'staticmethod (1, 1) (1, 13)
507 NEWLINE '\\n' (1, 13) (1, 14)
508 NAME 'def' (2, 0) (2, 3)
509 NAME 'foo' (2, 4) (2, 7)
510 OP '(' (2, 7) (2, 8)
511 NAME 'x' (2, 8) (2, 9)
512 OP ',' (2, 9) (2, 10)
513 NAME 'y' (2, 10) (2, 11)
514 OP ')' (2, 11) (2, 12)
515 OP ':' (2, 12) (2, 13)
516 NAME 'pass' (2, 14) (2, 18)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300517 """)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000518
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300519 def test_tabs(self):
520 # Evil tabs
521 self.check_tokenize("def f():\n"
522 "\tif x\n"
523 " \tpass", """\
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000524 NAME 'def' (1, 0) (1, 3)
525 NAME 'f' (1, 4) (1, 5)
526 OP '(' (1, 5) (1, 6)
527 OP ')' (1, 6) (1, 7)
528 OP ':' (1, 7) (1, 8)
529 NEWLINE '\\n' (1, 8) (1, 9)
530 INDENT '\\t' (2, 0) (2, 1)
531 NAME 'if' (2, 1) (2, 3)
532 NAME 'x' (2, 4) (2, 5)
533 NEWLINE '\\n' (2, 5) (2, 6)
534 INDENT ' \\t' (3, 0) (3, 9)
535 NAME 'pass' (3, 9) (3, 13)
536 DEDENT '' (4, 0) (4, 0)
537 DEDENT '' (4, 0) (4, 0)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300538 """)
Benjamin Peterson33856de2010-08-30 14:41:20 +0000539
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300540 def test_non_ascii_identifiers(self):
541 # Non-ascii identifiers
542 self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
Benjamin Peterson33856de2010-08-30 14:41:20 +0000543 NAME 'Örter' (1, 0) (1, 5)
544 OP '=' (1, 6) (1, 7)
545 STRING "'places'" (1, 8) (1, 16)
546 NEWLINE '\\n' (1, 16) (1, 17)
547 NAME 'grün' (2, 0) (2, 4)
548 OP '=' (2, 5) (2, 6)
549 STRING "'green'" (2, 7) (2, 14)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300550 """)
Armin Ronacherc0eaeca2012-03-04 13:07:57 +0000551
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300552 def test_unicode(self):
553 # Legacy unicode literals:
554 self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
Armin Ronacherc0eaeca2012-03-04 13:07:57 +0000555 NAME 'Örter' (1, 0) (1, 5)
556 OP '=' (1, 6) (1, 7)
557 STRING "u'places'" (1, 8) (1, 17)
558 NEWLINE '\\n' (1, 17) (1, 18)
559 NAME 'grün' (2, 0) (2, 4)
560 OP '=' (2, 5) (2, 6)
Christian Heimes0b3847d2012-06-20 11:17:58 +0200561 STRING "U'green'" (2, 7) (2, 15)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300562 """)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000563
Raymond Hettinger68c04532005-06-10 11:05:19 +0000564
Raymond Hettinger68c04532005-06-10 11:05:19 +0000565def decistmt(s):
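    """Substitute Decimal constructor calls for float literals in the string
    of statements `s`, using tokenize()/untokenize().  NUMBER tokens that
    contain a '.' are rewritten; e.g. (see TestMisc.test_decistmt below)
    '+21.3e-5*-.1234/81.7' becomes
    "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')".
    """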
Raymond Hettinger68c04532005-06-10 11:05:19 +0000566 result = []
Trent Nelson428de652008-03-18 22:41:35 +0000567 g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string
Raymond Hettinger68c04532005-06-10 11:05:19 +0000568 for toknum, tokval, _, _, _ in g:
569 if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens
570 result.extend([
571 (NAME, 'Decimal'),
572 (OP, '('),
573 (STRING, repr(tokval)),
574 (OP, ')')
575 ])
576 else:
577 result.append((toknum, tokval))
Trent Nelson428de652008-03-18 22:41:35 +0000578 return untokenize(result).decode('utf-8')
579
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300580class TestMisc(TestCase):
581
582 def test_decistmt(self):
583 # Substitute Decimals for floats in a string of statements.
584 # This is an example from the docs.
585
586 from decimal import Decimal
587 s = '+21.3e-5*-.1234/81.7'
588 self.assertEqual(decistmt(s),
589 "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")
590
591 # The format of the exponent is inherited from the platform C library.
592 # Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
593 # we're only showing 11 digits, and the 12th isn't close to 5, the
594 # rest of the output should be platform-independent.
595 self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')
596
597 # Output from calculations with Decimal should be identical across all
598 # platforms.
599 self.assertEqual(eval(decistmt(s)),
600 Decimal('-3.217160342717258261933904529E-7'))
601
Trent Nelson428de652008-03-18 22:41:35 +0000602
603class TestTokenizerAdheresToPep0263(TestCase):
604 """
605 Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
606 """
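    # (A coding cookie is a comment such as "# -*- coding: latin-1 -*-" on
    #  one of the first two lines of the file; see the data files used below.)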
607
608 def _testFile(self, filename):
609 path = os.path.join(os.path.dirname(__file__), filename)
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300610 TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
Trent Nelson428de652008-03-18 22:41:35 +0000611
612 def test_utf8_coding_cookie_and_no_utf8_bom(self):
Ned Deily2ea6fcc2011-07-19 16:15:27 -0700613 f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300614 self._testFile(f)
Trent Nelson428de652008-03-18 22:41:35 +0000615
616 def test_latin1_coding_cookie_and_utf8_bom(self):
617 """
618 As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
619 allowed encoding for the comment is 'utf-8'. The text file used in
620 this test starts with a BOM signature, but specifies latin1 as the
621 coding, so verify that a SyntaxError is raised, which matches the
622 behaviour of the interpreter when it encounters a similar condition.
623 """
624 f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000625 self.assertRaises(SyntaxError, self._testFile, f)
Trent Nelson428de652008-03-18 22:41:35 +0000626
627 def test_no_coding_cookie_and_utf8_bom(self):
628 f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300629 self._testFile(f)
Trent Nelson428de652008-03-18 22:41:35 +0000630
631 def test_utf8_coding_cookie_and_utf8_bom(self):
632 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +0300633 self._testFile(f)
Trent Nelson428de652008-03-18 22:41:35 +0000634
Florent Xicluna11f0b412012-07-07 12:13:35 +0200635 def test_bad_coding_cookie(self):
636 self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
637 self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
638
Trent Nelson428de652008-03-18 22:41:35 +0000639
640class Test_Tokenize(TestCase):
641
642 def test__tokenize_decodes_with_specified_encoding(self):
643 literal = '"ЉЊЈЁЂ"'
644 line = literal.encode('utf-8')
645 first = False
646 def readline():
647 nonlocal first
648 if not first:
649 first = True
650 return line
651 else:
652 return b''
653
654 # skip the initial encoding token and the end token
655 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
656 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
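        # (3 is token.STRING; the tuple fields are type, string, start, end, line.)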
Ezio Melottib3aedd42010-11-20 19:04:17 +0000657 self.assertEqual(tokens, expected_tokens,
658 "bytes not decoded with encoding")
Trent Nelson428de652008-03-18 22:41:35 +0000659
660 def test__tokenize_does_not_decode_with_encoding_none(self):
661 literal = '"ЉЊЈЁЂ"'
662 first = False
663 def readline():
664 nonlocal first
665 if not first:
666 first = True
667 return literal
668 else:
669 return b''
670
671 # skip the end token
672 tokens = list(_tokenize(readline, encoding=None))[:-1]
673 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melottib3aedd42010-11-20 19:04:17 +0000674 self.assertEqual(tokens, expected_tokens,
675 "string not tokenized when encoding is None")
Trent Nelson428de652008-03-18 22:41:35 +0000676
677
678class TestDetectEncoding(TestCase):
679
680 def get_readline(self, lines):
681 index = 0
682 def readline():
683 nonlocal index
684 if index == len(lines):
685 raise StopIteration
686 line = lines[index]
687 index += 1
688 return line
689 return readline
690
691 def test_no_bom_no_encoding_cookie(self):
692 lines = (
693 b'# something\n',
694 b'print(something)\n',
695 b'do_something(else)\n'
696 )
697 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000698 self.assertEqual(encoding, 'utf-8')
699 self.assertEqual(consumed_lines, list(lines[:2]))
Trent Nelson428de652008-03-18 22:41:35 +0000700
701 def test_bom_no_cookie(self):
702 lines = (
703 b'\xef\xbb\xbf# something\n',
704 b'print(something)\n',
705 b'do_something(else)\n'
706 )
707 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000708 self.assertEqual(encoding, 'utf-8-sig')
709 self.assertEqual(consumed_lines,
710 [b'# something\n', b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000711
712 def test_cookie_first_line_no_bom(self):
713 lines = (
714 b'# -*- coding: latin-1 -*-\n',
715 b'print(something)\n',
716 b'do_something(else)\n'
717 )
718 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000719 self.assertEqual(encoding, 'iso-8859-1')
720 self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000721
722 def test_matched_bom_and_cookie_first_line(self):
723 lines = (
724 b'\xef\xbb\xbf# coding=utf-8\n',
725 b'print(something)\n',
726 b'do_something(else)\n'
727 )
728 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000729 self.assertEqual(encoding, 'utf-8-sig')
730 self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000731
732 def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
733 lines = (
734 b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
735 b'print(something)\n',
736 b'do_something(else)\n'
737 )
738 readline = self.get_readline(lines)
739 self.assertRaises(SyntaxError, detect_encoding, readline)
740
741 def test_cookie_second_line_no_bom(self):
742 lines = (
743 b'#! something\n',
744 b'# vim: set fileencoding=ascii :\n',
745 b'print(something)\n',
746 b'do_something(else)\n'
747 )
748 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000749 self.assertEqual(encoding, 'ascii')
Trent Nelson428de652008-03-18 22:41:35 +0000750 expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
Ezio Melottib3aedd42010-11-20 19:04:17 +0000751 self.assertEqual(consumed_lines, expected)
Trent Nelson428de652008-03-18 22:41:35 +0000752
753 def test_matched_bom_and_cookie_second_line(self):
754 lines = (
755 b'\xef\xbb\xbf#! something\n',
756 b'f# coding=utf-8\n',
757 b'print(something)\n',
758 b'do_something(else)\n'
759 )
760 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000761 self.assertEqual(encoding, 'utf-8-sig')
762 self.assertEqual(consumed_lines,
763 [b'#! something\n', b'f# coding=utf-8\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000764
765 def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
766 lines = (
767 b'\xef\xbb\xbf#! something\n',
768 b'# vim: set fileencoding=ascii :\n',
769 b'print(something)\n',
770 b'do_something(else)\n'
771 )
772 readline = self.get_readline(lines)
773 self.assertRaises(SyntaxError, detect_encoding, readline)
774
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200775 def test_cookie_second_line_noncommented_first_line(self):
776 lines = (
777 b"print('\xc2\xa3')\n",
778 b'# vim: set fileencoding=iso8859-15 :\n',
779 b"print('\xe2\x82\xac')\n"
780 )
781 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
782 self.assertEqual(encoding, 'utf-8')
783 expected = [b"print('\xc2\xa3')\n"]
784 self.assertEqual(consumed_lines, expected)
785
786 def test_cookie_second_line_commented_first_line(self):
787 lines = (
788 b"#print('\xc2\xa3')\n",
789 b'# vim: set fileencoding=iso8859-15 :\n',
790 b"print('\xe2\x82\xac')\n"
791 )
792 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
793 self.assertEqual(encoding, 'iso8859-15')
794 expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
795 self.assertEqual(consumed_lines, expected)
796
797 def test_cookie_second_line_empty_first_line(self):
798 lines = (
799 b'\n',
800 b'# vim: set fileencoding=iso8859-15 :\n',
801 b"print('\xe2\x82\xac')\n"
802 )
803 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
804 self.assertEqual(encoding, 'iso8859-15')
805 expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
806 self.assertEqual(consumed_lines, expected)
807
Benjamin Petersond3afada2009-10-09 21:43:09 +0000808 def test_latin1_normalization(self):
809 # See get_normal_name() in tokenizer.c.
810 encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
811 "iso-8859-1-unix", "iso-latin-1-mac")
812 for encoding in encodings:
813 for rep in ("-", "_"):
814 enc = encoding.replace("-", rep)
815 lines = (b"#!/usr/bin/python\n",
816 b"# coding: " + enc.encode("ascii") + b"\n",
817 b"print(things)\n",
818 b"do_something += 4\n")
819 rl = self.get_readline(lines)
820 found, consumed_lines = detect_encoding(rl)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000821 self.assertEqual(found, "iso-8859-1")
Benjamin Petersond3afada2009-10-09 21:43:09 +0000822
Martin v. Löwis63674f42012-04-20 14:36:47 +0200823 def test_syntaxerror_latin1(self):
824 # Issue 14629: need to raise SyntaxError if the first
825 # line(s) have non-UTF-8 characters
826 lines = (
827 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
828 )
829 readline = self.get_readline(lines)
830 self.assertRaises(SyntaxError, detect_encoding, readline)
831
832
Benjamin Petersond3afada2009-10-09 21:43:09 +0000833 def test_utf8_normalization(self):
834 # See get_normal_name() in tokenizer.c.
835 encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
836 for encoding in encodings:
837 for rep in ("-", "_"):
838 enc = encoding.replace("-", rep)
839 lines = (b"#!/usr/bin/python\n",
840 b"# coding: " + enc.encode("ascii") + b"\n",
841 b"1 + 3\n")
842 rl = self.get_readline(lines)
843 found, consumed_lines = detect_encoding(rl)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000844 self.assertEqual(found, "utf-8")
Benjamin Petersond3afada2009-10-09 21:43:09 +0000845
Trent Nelson428de652008-03-18 22:41:35 +0000846 def test_short_files(self):
847 readline = self.get_readline((b'print(something)\n',))
848 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000849 self.assertEqual(encoding, 'utf-8')
850 self.assertEqual(consumed_lines, [b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000851
852 encoding, consumed_lines = detect_encoding(self.get_readline(()))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000853 self.assertEqual(encoding, 'utf-8')
854 self.assertEqual(consumed_lines, [])
Trent Nelson428de652008-03-18 22:41:35 +0000855
856 readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
857 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000858 self.assertEqual(encoding, 'utf-8-sig')
859 self.assertEqual(consumed_lines, [b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000860
861 readline = self.get_readline((b'\xef\xbb\xbf',))
862 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000863 self.assertEqual(encoding, 'utf-8-sig')
864 self.assertEqual(consumed_lines, [])
Trent Nelson428de652008-03-18 22:41:35 +0000865
Benjamin Peterson433f32c2008-12-12 01:25:05 +0000866 readline = self.get_readline((b'# coding: bad\n',))
867 self.assertRaises(SyntaxError, detect_encoding, readline)
Trent Nelson428de652008-03-18 22:41:35 +0000868
Serhiy Storchakadafea852013-09-16 23:51:56 +0300869 def test_false_encoding(self):
870 # Issue 18873: "Encoding" detected in non-comment lines
871 readline = self.get_readline((b'print("#coding=fake")',))
872 encoding, consumed_lines = detect_encoding(readline)
873 self.assertEqual(encoding, 'utf-8')
874 self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
875
Victor Stinner58c07522010-11-09 01:08:59 +0000876 def test_open(self):
877 filename = support.TESTFN + '.py'
878 self.addCleanup(support.unlink, filename)
879
880 # test coding cookie
881 for encoding in ('iso-8859-15', 'utf-8'):
882 with open(filename, 'w', encoding=encoding) as fp:
883 print("# coding: %s" % encoding, file=fp)
884 print("print('euro:\u20ac')", file=fp)
885 with tokenize_open(filename) as fp:
Victor Stinner92665ab2010-11-09 01:11:31 +0000886 self.assertEqual(fp.encoding, encoding)
887 self.assertEqual(fp.mode, 'r')
Victor Stinner58c07522010-11-09 01:08:59 +0000888
889 # test BOM (no coding cookie)
890 with open(filename, 'w', encoding='utf-8-sig') as fp:
891 print("print('euro:\u20ac')", file=fp)
892 with tokenize_open(filename) as fp:
Victor Stinner92665ab2010-11-09 01:11:31 +0000893 self.assertEqual(fp.encoding, 'utf-8-sig')
894 self.assertEqual(fp.mode, 'r')
Victor Stinner58c07522010-11-09 01:08:59 +0000895
Brett Cannonc33f3f22012-04-20 13:23:54 -0400896 def test_filename_in_exception(self):
897 # When possible, include the file name in the exception.
898 path = 'some_file_path'
899 lines = (
900 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
901 )
902 class Bunk:
903 def __init__(self, lines, path):
904 self.name = path
905 self._lines = lines
906 self._index = 0
907
908 def readline(self):
909 if self._index == len(lines):
910 raise StopIteration
911 line = lines[self._index]
912 self._index += 1
913 return line
914
915 with self.assertRaises(SyntaxError):
916 ins = Bunk(lines, path)
917 # Make sure lacking a name isn't an issue.
918 del ins.name
919 detect_encoding(ins.readline)
920 with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
921 ins = Bunk(lines, path)
922 detect_encoding(ins.readline)
923
Victor Stinner387729e2015-05-26 00:43:58 +0200924 def test_open_error(self):
925 # Issue #23840: open() must close the binary file on error
926 m = BytesIO(b'#coding:xxx')
927 with mock.patch('tokenize._builtin_open', return_value=m):
928 self.assertRaises(SyntaxError, tokenize_open, 'foobar')
929 self.assertTrue(m.closed)
930
931
Trent Nelson428de652008-03-18 22:41:35 +0000932class TestTokenize(TestCase):
933
934 def test_tokenize(self):
935 import tokenize as tokenize_module
936 encoding = object()
937 encoding_used = None
938 def mock_detect_encoding(readline):
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +0200939 return encoding, [b'first', b'second']
Trent Nelson428de652008-03-18 22:41:35 +0000940
941 def mock__tokenize(readline, encoding):
942 nonlocal encoding_used
943 encoding_used = encoding
944 out = []
945 while True:
946 next_line = readline()
947 if next_line:
948 out.append(next_line)
949 continue
950 return out
951
952 counter = 0
953 def mock_readline():
954 nonlocal counter
955 counter += 1
956 if counter == 5:
957 return b''
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +0200958 return str(counter).encode()
Trent Nelson428de652008-03-18 22:41:35 +0000959
960 orig_detect_encoding = tokenize_module.detect_encoding
961 orig__tokenize = tokenize_module._tokenize
962 tokenize_module.detect_encoding = mock_detect_encoding
963 tokenize_module._tokenize = mock__tokenize
964 try:
965 results = tokenize(mock_readline)
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +0200966 self.assertEqual(list(results),
967 [b'first', b'second', b'1', b'2', b'3', b'4'])
Trent Nelson428de652008-03-18 22:41:35 +0000968 finally:
969 tokenize_module.detect_encoding = orig_detect_encoding
970 tokenize_module._tokenize = orig__tokenize
971
        self.assertEqual(encoding_used, encoding)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000973
Meador Inge00c7f852012-01-19 00:44:45 -0600974 def assertExactTypeEqual(self, opstr, *optypes):
975 tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
976 num_optypes = len(optypes)
977 self.assertEqual(len(tokens), 2 + num_optypes)
978 self.assertEqual(token.tok_name[tokens[0].exact_type],
979 token.tok_name[ENCODING])
980 for i in range(num_optypes):
981 self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
982 token.tok_name[optypes[i]])
983 self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
984 token.tok_name[token.ENDMARKER])
985
986 def test_exact_type(self):
987 self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
988 self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
989 self.assertExactTypeEqual(':', token.COLON)
990 self.assertExactTypeEqual(',', token.COMMA)
991 self.assertExactTypeEqual(';', token.SEMI)
992 self.assertExactTypeEqual('+', token.PLUS)
993 self.assertExactTypeEqual('-', token.MINUS)
994 self.assertExactTypeEqual('*', token.STAR)
995 self.assertExactTypeEqual('/', token.SLASH)
996 self.assertExactTypeEqual('|', token.VBAR)
997 self.assertExactTypeEqual('&', token.AMPER)
998 self.assertExactTypeEqual('<', token.LESS)
999 self.assertExactTypeEqual('>', token.GREATER)
1000 self.assertExactTypeEqual('=', token.EQUAL)
1001 self.assertExactTypeEqual('.', token.DOT)
1002 self.assertExactTypeEqual('%', token.PERCENT)
1003 self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1004 self.assertExactTypeEqual('==', token.EQEQUAL)
1005 self.assertExactTypeEqual('!=', token.NOTEQUAL)
1006 self.assertExactTypeEqual('<=', token.LESSEQUAL)
1007 self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1008 self.assertExactTypeEqual('~', token.TILDE)
1009 self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1010 self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1011 self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1012 self.assertExactTypeEqual('**', token.DOUBLESTAR)
1013 self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1014 self.assertExactTypeEqual('-=', token.MINEQUAL)
1015 self.assertExactTypeEqual('*=', token.STAREQUAL)
1016 self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1017 self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1018 self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1019 self.assertExactTypeEqual('|=', token.VBAREQUAL)
1020 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1021 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1022 self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1023 self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1024 self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1025 self.assertExactTypeEqual('//', token.DOUBLESLASH)
1026 self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
1027 self.assertExactTypeEqual('@', token.AT)
1028
1029 self.assertExactTypeEqual('a**2+b**2==c**2',
1030 NAME, token.DOUBLESTAR, NUMBER,
1031 token.PLUS,
1032 NAME, token.DOUBLESTAR, NUMBER,
1033 token.EQEQUAL,
1034 NAME, token.DOUBLESTAR, NUMBER)
1035 self.assertExactTypeEqual('{1, 2, 3}',
1036 token.LBRACE,
1037 token.NUMBER, token.COMMA,
1038 token.NUMBER, token.COMMA,
1039 token.NUMBER,
1040 token.RBRACE)
1041 self.assertExactTypeEqual('^(x & 0x1)',
1042 token.CIRCUMFLEX,
1043 token.LPAR,
1044 token.NAME, token.AMPER, token.NUMBER,
1045 token.RPAR)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001046
Ezio Melottifafa8b72012-11-03 17:46:51 +02001047 def test_pathological_trailing_whitespace(self):
1048 # See http://bugs.python.org/issue16152
1049 self.assertExactTypeEqual('@ ', token.AT)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001050
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001051
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001052class UntokenizeTest(TestCase):
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001053
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001054 def test_bad_input_order(self):
        # raise if the start position is on an earlier row than the previous token's end
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001056 u = Untokenizer()
1057 u.prev_row = 2
1058 u.prev_col = 2
1059 with self.assertRaises(ValueError) as cm:
1060 u.add_whitespace((1,3))
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001061 self.assertEqual(cm.exception.args[0],
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001062 'start (1,3) precedes previous end (2,2)')
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001063 # raise if previous column in row
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001064 self.assertRaises(ValueError, u.add_whitespace, (2,1))
1065
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001066 def test_backslash_continuation(self):
1067 # The problem is that <whitespace>\<newline> leaves no token
1068 u = Untokenizer()
1069 u.prev_row = 1
1070 u.prev_col = 1
1071 u.tokens = []
1072 u.add_whitespace((2, 0))
1073 self.assertEqual(u.tokens, ['\\\n'])
1074 u.prev_row = 2
1075 u.add_whitespace((4, 4))
1076 self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' '])
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001077 TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n')
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001078
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -05001079 def test_iter_compat(self):
1080 u = Untokenizer()
1081 token = (NAME, 'Hello')
1082 tokens = [(ENCODING, 'utf-8'), token]
1083 u.compat(token, iter([]))
1084 self.assertEqual(u.tokens, ["Hello "])
1085 u = Untokenizer()
1086 self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1087 u = Untokenizer()
1088 self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1089 self.assertEqual(u.encoding, 'utf-8')
1090 self.assertEqual(untokenize(iter(tokens)), b'Hello ')
1091
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001092
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001093class TestRoundtrip(TestCase):
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001094
1095 def check_roundtrip(self, f):
1096 """
1097 Test roundtrip for `untokenize`. `f` is an open file or a string.
1098 The source code in f is tokenized to both 5- and 2-tuples.
1099 Both sequences are converted back to source code via
1100 tokenize.untokenize(), and the latter tokenized again to 2-tuples.
1101 The test fails if the 3 pair tokenizations do not match.
1102
1103 When untokenize bugs are fixed, untokenize with 5-tuples should
1104 reproduce code that does not contain a backslash continuation
1105 following spaces. A proper test should test this.
1106 """
1107 # Get source code and original tokenizations
1108 if isinstance(f, str):
1109 code = f.encode('utf-8')
1110 else:
1111 code = f.read()
1112 f.close()
1113 readline = iter(code.splitlines(keepends=True)).__next__
1114 tokens5 = list(tokenize(readline))
1115 tokens2 = [tok[:2] for tok in tokens5]
1116 # Reproduce tokens2 from pairs
1117 bytes_from2 = untokenize(tokens2)
1118 readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
1119 tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
1120 self.assertEqual(tokens2_from2, tokens2)
1121 # Reproduce tokens2 from 5-tuples
1122 bytes_from5 = untokenize(tokens5)
1123 readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
1124 tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
1125 self.assertEqual(tokens2_from5, tokens2)
1126
1127 def test_roundtrip(self):
1128 # There are some standard formatting practices that are easy to get right.
1129
1130 self.check_roundtrip("if x == 1:\n"
1131 " print(x)\n")
1132 self.check_roundtrip("# This is a comment\n"
1133 "# This also")
1134
1135 # Some people use different formatting conventions, which makes
1136 # untokenize a little trickier. Note that this test involves trailing
1137 # whitespace after the colon. Note that we use hex escapes to make the
1138 # two trailing blanks apparent in the expected output.
1139
1140 self.check_roundtrip("if x == 1 : \n"
1141 " print(x)\n")
1142 fn = support.findfile("tokenize_tests.txt")
1143 with open(fn, 'rb') as f:
1144 self.check_roundtrip(f)
1145 self.check_roundtrip("if x == 1:\n"
1146 " # A comment by itself.\n"
1147 " print(x) # Comment here, too.\n"
1148 " # Another comment.\n"
1149 "after_if = True\n")
1150 self.check_roundtrip("if (x # The comments need to go in the right place\n"
1151 " == 1):\n"
1152 " print('x==1')\n")
1153 self.check_roundtrip("class Test: # A comment here\n"
1154 " # A comment with weird indent\n"
1155 " after_com = 5\n"
1156 " def x(m): return m*5 # a one liner\n"
1157 " def y(m): # A whitespace after the colon\n"
1158 " return y*4 # 3-space indent\n")
1159
1160 # Some error-handling code
1161 self.check_roundtrip("try: import somemodule\n"
1162 "except ImportError: # comment\n"
1163 " print('Can not import' # comment2\n)"
1164 "else: print('Loaded')\n")
1165
1166 def test_continuation(self):
1167 # Balancing continuation
1168 self.check_roundtrip("a = (3,4, \n"
1169 "5,6)\n"
1170 "y = [3, 4,\n"
1171 "5]\n"
1172 "z = {'a': 5,\n"
1173 "'b':15, 'c':True}\n"
1174 "x = len(y) + 5 - a[\n"
1175 "3] - a[2]\n"
1176 "+ len(z) - z[\n"
1177 "'b']\n")
1178
1179 def test_backslash_continuation(self):
1180 # Backslash means line continuation, except for comments
1181 self.check_roundtrip("x=1+\\\n"
1182 "1\n"
1183 "# This is a comment\\\n"
1184 "# This also\n")
1185 self.check_roundtrip("# Comment \\\n"
1186 "x = 0")
1187
1188 def test_string_concatenation(self):
1189 # Two string literals on the same line
1190 self.check_roundtrip("'' ''")
1191
1192 def test_random_files(self):
1193 # Test roundtrip on random python modules.
1194 # pass the '-ucpu' option to process the full directory.
1195
1196 import glob, random
1197 fn = support.findfile("tokenize_tests.txt")
1198 tempdir = os.path.dirname(fn) or os.curdir
1199 testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
1200
1201 # Tokenize is broken on test_pep3131.py because regular expressions are
1202 # broken on the obscure unicode identifiers in it. *sigh*
1203 # With roundtrip extended to test the 5-tuple mode of untokenize,
1204 # 7 more testfiles fail. Remove them also until the failure is diagnosed.
1205
1206 testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
1207 for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
1208 testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
1209
1210 if not support.is_resource_enabled("cpu"):
1211 testfiles = random.sample(testfiles, 10)
1212
1213 for testfile in testfiles:
1214 with open(testfile, 'rb') as f:
1215 with self.subTest(file=testfile):
1216 self.check_roundtrip(f)
1217
1218
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001219 def roundtrip(self, code):
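        """Untokenize the tokenization of `code` (str or bytes) and return
        the regenerated source decoded as UTF-8."""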
1220 if isinstance(code, str):
1221 code = code.encode('utf-8')
Jason R. Coombsb6d1cdd2015-06-25 22:42:24 -04001222 return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001223
1224 def test_indentation_semantics_retained(self):
1225 """
1226 Ensure that although whitespace might be mutated in a roundtrip,
1227 the semantic meaning of the indentation remains consistent.
1228 """
1229 code = "if False:\n\tx=3\n\tx=3\n"
Jason R. Coombsb6d1cdd2015-06-25 22:42:24 -04001230 codelines = self.roundtrip(code).split('\n')
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001231 self.assertEqual(codelines[1], codelines[2])
Serhiy Storchaka5f6fa822015-10-06 18:16:28 +03001232 self.check_roundtrip(code)
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001233
1234
if __name__ == "__main__":
    unittest.main()