doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. Note that we use hex escapes to make the
two trailing blanks apparent in the expected output.

    >>> roundtrip("if x == 1 : \\n"
    ...           "    print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else:   print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12@42")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

Tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
With roundtrip extended to test the 5-tuple mode of untokenize,
7 more testfiles fail. Remove them also until the failure is diagnosed.

    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
    ...     testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
    ...
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
633Legacy unicode literals:
634
Christian Heimes0b3847d2012-06-20 11:17:58 +0200635 >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
Armin Ronacherc0eaeca2012-03-04 13:07:57 +0000636 ENCODING 'utf-8' (0, 0) (0, 0)
637 NAME 'Örter' (1, 0) (1, 5)
638 OP '=' (1, 6) (1, 7)
639 STRING "u'places'" (1, 8) (1, 17)
640 NEWLINE '\\n' (1, 17) (1, 18)
641 NAME 'grün' (2, 0) (2, 4)
642 OP '=' (2, 5) (2, 6)
Christian Heimes0b3847d2012-06-20 11:17:58 +0200643 STRING "U'green'" (2, 7) (2, 15)
Yury Selivanov75445082015-05-11 22:57:16 -0400644
645Async/await extension:
646
647 >>> dump_tokens("async = 1")
648 ENCODING 'utf-8' (0, 0) (0, 0)
649 NAME 'async' (1, 0) (1, 5)
650 OP '=' (1, 6) (1, 7)
651 NUMBER '1' (1, 8) (1, 9)
652
653 >>> dump_tokens("a = (async = 1)")
654 ENCODING 'utf-8' (0, 0) (0, 0)
655 NAME 'a' (1, 0) (1, 1)
656 OP '=' (1, 2) (1, 3)
657 OP '(' (1, 4) (1, 5)
658 NAME 'async' (1, 5) (1, 10)
659 OP '=' (1, 11) (1, 12)
660 NUMBER '1' (1, 13) (1, 14)
661 OP ')' (1, 14) (1, 15)
662
663 >>> dump_tokens("async()")
664 ENCODING 'utf-8' (0, 0) (0, 0)
665 NAME 'async' (1, 0) (1, 5)
666 OP '(' (1, 5) (1, 6)
667 OP ')' (1, 6) (1, 7)
668
669 >>> dump_tokens("class async(Bar):pass")
670 ENCODING 'utf-8' (0, 0) (0, 0)
671 NAME 'class' (1, 0) (1, 5)
672 NAME 'async' (1, 6) (1, 11)
673 OP '(' (1, 11) (1, 12)
674 NAME 'Bar' (1, 12) (1, 15)
675 OP ')' (1, 15) (1, 16)
676 OP ':' (1, 16) (1, 17)
677 NAME 'pass' (1, 17) (1, 21)
678
679 >>> dump_tokens("class async:pass")
680 ENCODING 'utf-8' (0, 0) (0, 0)
681 NAME 'class' (1, 0) (1, 5)
682 NAME 'async' (1, 6) (1, 11)
683 OP ':' (1, 11) (1, 12)
684 NAME 'pass' (1, 12) (1, 16)
685
686 >>> dump_tokens("await = 1")
687 ENCODING 'utf-8' (0, 0) (0, 0)
688 NAME 'await' (1, 0) (1, 5)
689 OP '=' (1, 6) (1, 7)
690 NUMBER '1' (1, 8) (1, 9)
691
692 >>> dump_tokens("foo.async")
693 ENCODING 'utf-8' (0, 0) (0, 0)
694 NAME 'foo' (1, 0) (1, 3)
695 OP '.' (1, 3) (1, 4)
696 NAME 'async' (1, 4) (1, 9)
697
698 >>> dump_tokens("async for a in b: pass")
699 ENCODING 'utf-8' (0, 0) (0, 0)
700 NAME 'async' (1, 0) (1, 5)
701 NAME 'for' (1, 6) (1, 9)
702 NAME 'a' (1, 10) (1, 11)
703 NAME 'in' (1, 12) (1, 14)
704 NAME 'b' (1, 15) (1, 16)
705 OP ':' (1, 16) (1, 17)
706 NAME 'pass' (1, 18) (1, 22)
707
708 >>> dump_tokens("async with a as b: pass")
709 ENCODING 'utf-8' (0, 0) (0, 0)
710 NAME 'async' (1, 0) (1, 5)
711 NAME 'with' (1, 6) (1, 10)
712 NAME 'a' (1, 11) (1, 12)
713 NAME 'as' (1, 13) (1, 15)
714 NAME 'b' (1, 16) (1, 17)
715 OP ':' (1, 17) (1, 18)
716 NAME 'pass' (1, 19) (1, 23)
717
718 >>> dump_tokens("async.foo")
719 ENCODING 'utf-8' (0, 0) (0, 0)
720 NAME 'async' (1, 0) (1, 5)
721 OP '.' (1, 5) (1, 6)
722 NAME 'foo' (1, 6) (1, 9)
723
724 >>> dump_tokens("async")
725 ENCODING 'utf-8' (0, 0) (0, 0)
726 NAME 'async' (1, 0) (1, 5)
727
728 >>> dump_tokens("async\\n#comment\\nawait")
729 ENCODING 'utf-8' (0, 0) (0, 0)
730 NAME 'async' (1, 0) (1, 5)
731 NEWLINE '\\n' (1, 5) (1, 6)
732 COMMENT '#comment' (2, 0) (2, 8)
733 NL '\\n' (2, 8) (2, 9)
734 NAME 'await' (3, 0) (3, 5)
735
736 >>> dump_tokens("async\\n...\\nawait")
737 ENCODING 'utf-8' (0, 0) (0, 0)
738 NAME 'async' (1, 0) (1, 5)
739 NEWLINE '\\n' (1, 5) (1, 6)
740 OP '...' (2, 0) (2, 3)
741 NEWLINE '\\n' (2, 3) (2, 4)
742 NAME 'await' (3, 0) (3, 5)
743
744 >>> dump_tokens("async\\nawait")
745 ENCODING 'utf-8' (0, 0) (0, 0)
746 NAME 'async' (1, 0) (1, 5)
747 NEWLINE '\\n' (1, 5) (1, 6)
748 NAME 'await' (2, 0) (2, 5)
749
750 >>> dump_tokens("foo.async + 1")
751 ENCODING 'utf-8' (0, 0) (0, 0)
752 NAME 'foo' (1, 0) (1, 3)
753 OP '.' (1, 3) (1, 4)
754 NAME 'async' (1, 4) (1, 9)
755 OP '+' (1, 10) (1, 11)
756 NUMBER '1' (1, 12) (1, 13)
757
758 >>> dump_tokens("async def foo(): pass")
759 ENCODING 'utf-8' (0, 0) (0, 0)
760 ASYNC 'async' (1, 0) (1, 5)
761 NAME 'def' (1, 6) (1, 9)
762 NAME 'foo' (1, 10) (1, 13)
763 OP '(' (1, 13) (1, 14)
764 OP ')' (1, 14) (1, 15)
765 OP ':' (1, 15) (1, 16)
766 NAME 'pass' (1, 17) (1, 21)
767
768 >>> dump_tokens('''async def foo():
769 ... def foo(await):
770 ... await = 1
771 ... if 1:
772 ... await
773 ... async += 1
774 ... ''')
775 ENCODING 'utf-8' (0, 0) (0, 0)
776 ASYNC 'async' (1, 0) (1, 5)
777 NAME 'def' (1, 6) (1, 9)
778 NAME 'foo' (1, 10) (1, 13)
779 OP '(' (1, 13) (1, 14)
780 OP ')' (1, 14) (1, 15)
781 OP ':' (1, 15) (1, 16)
782 NEWLINE '\\n' (1, 16) (1, 17)
783 INDENT ' ' (2, 0) (2, 2)
784 NAME 'def' (2, 2) (2, 5)
785 NAME 'foo' (2, 6) (2, 9)
786 OP '(' (2, 9) (2, 10)
787 NAME 'await' (2, 10) (2, 15)
788 OP ')' (2, 15) (2, 16)
789 OP ':' (2, 16) (2, 17)
790 NEWLINE '\\n' (2, 17) (2, 18)
791 INDENT ' ' (3, 0) (3, 4)
792 NAME 'await' (3, 4) (3, 9)
793 OP '=' (3, 10) (3, 11)
794 NUMBER '1' (3, 12) (3, 13)
795 NEWLINE '\\n' (3, 13) (3, 14)
796 DEDENT '' (4, 2) (4, 2)
797 NAME 'if' (4, 2) (4, 4)
798 NUMBER '1' (4, 5) (4, 6)
799 OP ':' (4, 6) (4, 7)
800 NEWLINE '\\n' (4, 7) (4, 8)
801 INDENT ' ' (5, 0) (5, 4)
802 AWAIT 'await' (5, 4) (5, 9)
803 NEWLINE '\\n' (5, 9) (5, 10)
804 DEDENT '' (6, 0) (6, 0)
805 DEDENT '' (6, 0) (6, 0)
806 NAME 'async' (6, 0) (6, 5)
807 OP '+=' (6, 6) (6, 8)
808 NUMBER '1' (6, 9) (6, 10)
809 NEWLINE '\\n' (6, 10) (6, 11)
810
811 >>> dump_tokens('''async def foo():
812 ... async for i in 1: pass''')
813 ENCODING 'utf-8' (0, 0) (0, 0)
814 ASYNC 'async' (1, 0) (1, 5)
815 NAME 'def' (1, 6) (1, 9)
816 NAME 'foo' (1, 10) (1, 13)
817 OP '(' (1, 13) (1, 14)
818 OP ')' (1, 14) (1, 15)
819 OP ':' (1, 15) (1, 16)
820 NEWLINE '\\n' (1, 16) (1, 17)
821 INDENT ' ' (2, 0) (2, 2)
822 ASYNC 'async' (2, 2) (2, 7)
823 NAME 'for' (2, 8) (2, 11)
824 NAME 'i' (2, 12) (2, 13)
825 NAME 'in' (2, 14) (2, 16)
826 NUMBER '1' (2, 17) (2, 18)
827 OP ':' (2, 18) (2, 19)
828 NAME 'pass' (2, 20) (2, 24)
829 DEDENT '' (3, 0) (3, 0)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000830"""
831
Benjamin Petersonee8712c2008-05-20 21:35:26 +0000832from test import support
Trent Nelson428de652008-03-18 22:41:35 +0000833from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
Meador Inge00c7f852012-01-19 00:44:45 -0600834 STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
Terry Jan Reedy5e6db312014-02-17 16:45:48 -0500835 open as tokenize_open, Untokenizer)
Trent Nelson428de652008-03-18 22:41:35 +0000836from io import BytesIO
Victor Stinner387729e2015-05-26 00:43:58 +0200837from unittest import TestCase, mock
Trent Nelson428de652008-03-18 22:41:35 +0000838import os, sys, glob
Meador Inge00c7f852012-01-19 00:44:45 -0600839import token
Raymond Hettinger68c04532005-06-10 11:05:19 +0000840
Thomas Wouters89f507f2006-12-13 04:49:30 +0000841def dump_tokens(s):
842 """Print out the tokens in s in a table format.
843
844 The ENDMARKER is omitted.
845 """
Trent Nelson428de652008-03-18 22:41:35 +0000846 f = BytesIO(s.encode('utf-8'))
847 for type, token, start, end, line in tokenize(f.readline):
Thomas Wouters89f507f2006-12-13 04:49:30 +0000848 if type == ENDMARKER:
849 break
850 type = tok_name[type]
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000851 print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())
Thomas Wouters89f507f2006-12-13 04:49:30 +0000852
Trent Nelson428de652008-03-18 22:41:35 +0000853def roundtrip(f):
854 """
855 Test roundtrip for `untokenize`. `f` is an open file or a string.
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500856 The source code in f is tokenized to both 5- and 2-tuples.
857 Both sequences are converted back to source code via
858 tokenize.untokenize(), and the latter tokenized again to 2-tuples.
859 The test fails if the 3 pair tokenizations do not match.
860
861 When untokenize bugs are fixed, untokenize with 5-tuples should
862 reproduce code that does not contain a backslash continuation
863 following spaces. A proper test should test this.
864
865 This function would be more useful for correcting bugs if it reported
866 the first point of failure, like assertEqual, rather than just
867 returning False -- or if it were only used in unittests and not
868 doctest and actually used assertEqual.
Trent Nelson428de652008-03-18 22:41:35 +0000869 """
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500870 # Get source code and original tokenizations
Trent Nelson428de652008-03-18 22:41:35 +0000871 if isinstance(f, str):
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500872 code = f.encode('utf-8')
873 else:
874 code = f.read()
Brian Curtin9f5f65c2010-10-30 21:35:28 +0000875 f.close()
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500876 readline = iter(code.splitlines(keepends=True)).__next__
877 tokens5 = list(tokenize(readline))
878 tokens2 = [tok[:2] for tok in tokens5]
879 # Reproduce tokens2 from pairs
880 bytes_from2 = untokenize(tokens2)
881 readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
882 tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
883 # Reproduce tokens2 from 5-tuples
884 bytes_from5 = untokenize(tokens5)
885 readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
886 tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
887 # Compare 3 versions
888 return tokens2 == tokens2_from2 == tokens2_from5
Thomas Wouters89f507f2006-12-13 04:49:30 +0000889
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000890# This is an example from the docs, set up as a doctest.
Raymond Hettinger68c04532005-06-10 11:05:19 +0000891def decistmt(s):
892 """Substitute Decimals for floats in a string of statements.
893
894 >>> from decimal import Decimal
Georg Brandl88fc6642007-02-09 21:28:07 +0000895 >>> s = 'print(+21.3e-5*-.1234/81.7)'
Raymond Hettinger68c04532005-06-10 11:05:19 +0000896 >>> decistmt(s)
Georg Brandl88fc6642007-02-09 21:28:07 +0000897 "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"
Raymond Hettinger68c04532005-06-10 11:05:19 +0000898
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000899 The format of the exponent is inherited from the platform C library.
900 Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
Mark Dickinson388122d2010-08-04 20:56:28 +0000901 we're only showing 11 digits, and the 12th isn't close to 5, the
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000902 rest of the output should be platform-independent.
903
904 >>> exec(s) #doctest: +ELLIPSIS
Mark Dickinson388122d2010-08-04 20:56:28 +0000905 -3.2171603427...e-0...7
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000906
907 Output from calculations with Decimal should be identical across all
908 platforms.
909
Raymond Hettinger68c04532005-06-10 11:05:19 +0000910 >>> exec(decistmt(s))
911 -3.217160342717258261933904529E-7
Raymond Hettinger68c04532005-06-10 11:05:19 +0000912 """
913 result = []
Trent Nelson428de652008-03-18 22:41:35 +0000914 g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string
Raymond Hettinger68c04532005-06-10 11:05:19 +0000915 for toknum, tokval, _, _, _ in g:
916 if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens
917 result.extend([
918 (NAME, 'Decimal'),
919 (OP, '('),
920 (STRING, repr(tokval)),
921 (OP, ')')
922 ])
923 else:
924 result.append((toknum, tokval))
Trent Nelson428de652008-03-18 22:41:35 +0000925 return untokenize(result).decode('utf-8')
926
927
class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


966
967 def test__tokenize_decodes_with_specified_encoding(self):
968 literal = '"ЉЊЈЁЂ"'
969 line = literal.encode('utf-8')
970 first = False
971 def readline():
972 nonlocal first
973 if not first:
974 first = True
975 return line
976 else:
977 return b''
978
979 # skip the initial encoding token and the end token
980 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
981 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melottib3aedd42010-11-20 19:04:17 +0000982 self.assertEqual(tokens, expected_tokens,
983 "bytes not decoded with encoding")
Trent Nelson428de652008-03-18 22:41:35 +0000984
985 def test__tokenize_does_not_decode_with_encoding_none(self):
986 literal = '"ЉЊЈЁЂ"'
987 first = False
988 def readline():
989 nonlocal first
990 if not first:
991 first = True
992 return literal
993 else:
994 return b''
995
996 # skip the end token
997 tokens = list(_tokenize(readline, encoding=None))[:-1]
998 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melottib3aedd42010-11-20 19:04:17 +0000999 self.assertEqual(tokens, expected_tokens,
1000 "string not tokenized when encoding is None")
Trent Nelson428de652008-03-18 22:41:35 +00001001
1002
class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
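            # Running past the supplied lines raises StopIteration, which
            # detect_encoding()'s internal read_or_stop() treats as EOF (b'').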
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


1259
1260 def test_tokenize(self):
1261 import tokenize as tokenize_module
1262 encoding = object()
1263 encoding_used = None
1264 def mock_detect_encoding(readline):
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001265 return encoding, [b'first', b'second']
Trent Nelson428de652008-03-18 22:41:35 +00001266
1267 def mock__tokenize(readline, encoding):
1268 nonlocal encoding_used
1269 encoding_used = encoding
1270 out = []
1271 while True:
1272 next_line = readline()
1273 if next_line:
1274 out.append(next_line)
1275 continue
1276 return out
1277
1278 counter = 0
1279 def mock_readline():
1280 nonlocal counter
1281 counter += 1
1282 if counter == 5:
1283 return b''
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001284 return str(counter).encode()
Trent Nelson428de652008-03-18 22:41:35 +00001285
1286 orig_detect_encoding = tokenize_module.detect_encoding
1287 orig__tokenize = tokenize_module._tokenize
1288 tokenize_module.detect_encoding = mock_detect_encoding
1289 tokenize_module._tokenize = mock__tokenize
1290 try:
1291 results = tokenize(mock_readline)
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001292 self.assertEqual(list(results),
1293 [b'first', b'second', b'1', b'2', b'3', b'4'])
Trent Nelson428de652008-03-18 22:41:35 +00001294 finally:
1295 tokenize_module.detect_encoding = orig_detect_encoding
1296 tokenize_module._tokenize = orig__tokenize
1297
1298 self.assertTrue(encoding_used, encoding)
Raymond Hettinger68c04532005-06-10 11:05:19 +00001299
Yury Selivanov8085b802015-05-18 12:50:52 -04001300 def test_oneline_defs(self):
1301 buf = []
1302 for i in range(500):
1303 buf.append('def i{i}(): return {i}'.format(i=i))
1304 buf.append('OK')
1305 buf = '\n'.join(buf)
1306
1307 # Test that 500 consequent, one-line defs is OK
1308 toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
1309 self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
1310
Meador Inge00c7f852012-01-19 00:44:45 -06001311 def assertExactTypeEqual(self, opstr, *optypes):
1312 tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1313 num_optypes = len(optypes)
1314 self.assertEqual(len(tokens), 2 + num_optypes)
1315 self.assertEqual(token.tok_name[tokens[0].exact_type],
1316 token.tok_name[ENCODING])
1317 for i in range(num_optypes):
1318 self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
1319 token.tok_name[optypes[i]])
1320 self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
1321 token.tok_name[token.ENDMARKER])
1322
1323 def test_exact_type(self):
1324 self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1325 self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1326 self.assertExactTypeEqual(':', token.COLON)
1327 self.assertExactTypeEqual(',', token.COMMA)
1328 self.assertExactTypeEqual(';', token.SEMI)
1329 self.assertExactTypeEqual('+', token.PLUS)
1330 self.assertExactTypeEqual('-', token.MINUS)
1331 self.assertExactTypeEqual('*', token.STAR)
1332 self.assertExactTypeEqual('/', token.SLASH)
1333 self.assertExactTypeEqual('|', token.VBAR)
1334 self.assertExactTypeEqual('&', token.AMPER)
1335 self.assertExactTypeEqual('<', token.LESS)
1336 self.assertExactTypeEqual('>', token.GREATER)
1337 self.assertExactTypeEqual('=', token.EQUAL)
1338 self.assertExactTypeEqual('.', token.DOT)
1339 self.assertExactTypeEqual('%', token.PERCENT)
1340 self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1341 self.assertExactTypeEqual('==', token.EQEQUAL)
1342 self.assertExactTypeEqual('!=', token.NOTEQUAL)
1343 self.assertExactTypeEqual('<=', token.LESSEQUAL)
1344 self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1345 self.assertExactTypeEqual('~', token.TILDE)
1346 self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1347 self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1348 self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1349 self.assertExactTypeEqual('**', token.DOUBLESTAR)
1350 self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1351 self.assertExactTypeEqual('-=', token.MINEQUAL)
1352 self.assertExactTypeEqual('*=', token.STAREQUAL)
1353 self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1354 self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1355 self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1356 self.assertExactTypeEqual('|=', token.VBAREQUAL)
1357 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1358 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1359 self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1360 self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1361 self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1362 self.assertExactTypeEqual('//', token.DOUBLESLASH)
1363 self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
1364 self.assertExactTypeEqual('@', token.AT)
Benjamin Petersond51374e2014-04-09 23:55:56 -04001365 self.assertExactTypeEqual('@=', token.ATEQUAL)
Meador Inge00c7f852012-01-19 00:44:45 -06001366
1367 self.assertExactTypeEqual('a**2+b**2==c**2',
1368 NAME, token.DOUBLESTAR, NUMBER,
1369 token.PLUS,
1370 NAME, token.DOUBLESTAR, NUMBER,
1371 token.EQEQUAL,
1372 NAME, token.DOUBLESTAR, NUMBER)
1373 self.assertExactTypeEqual('{1, 2, 3}',
1374 token.LBRACE,
1375 token.NUMBER, token.COMMA,
1376 token.NUMBER, token.COMMA,
1377 token.NUMBER,
1378 token.RBRACE)
1379 self.assertExactTypeEqual('^(x & 0x1)',
1380 token.CIRCUMFLEX,
1381 token.LPAR,
1382 token.NAME, token.AMPER, token.NUMBER,
1383 token.RPAR)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001384
Ezio Melottifafa8b72012-11-03 17:46:51 +02001385 def test_pathological_trailing_whitespace(self):
1386 # See http://bugs.python.org/issue16152
1387 self.assertExactTypeEqual('@ ', token.AT)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001388
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001389class UntokenizeTest(TestCase):
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001390
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001391 def test_bad_input_order(self):
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001392 # raise if previous row
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001393 u = Untokenizer()
1394 u.prev_row = 2
1395 u.prev_col = 2
1396 with self.assertRaises(ValueError) as cm:
1397 u.add_whitespace((1,3))
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001398 self.assertEqual(cm.exception.args[0],
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001399 'start (1,3) precedes previous end (2,2)')
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001400 # raise if previous column in row
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001401 self.assertRaises(ValueError, u.add_whitespace, (2,1))
1402
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001403 def test_backslash_continuation(self):
1404 # The problem is that <whitespace>\<newline> leaves no token
1405 u = Untokenizer()
1406 u.prev_row = 1
1407 u.prev_col = 1
1408 u.tokens = []
1409 u.add_whitespace((2, 0))
1410 self.assertEqual(u.tokens, ['\\\n'])
1411 u.prev_row = 2
1412 u.add_whitespace((4, 4))
1413 self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' '])
1414 self.assertTrue(roundtrip('a\n b\n c\n \\\n c\n'))
1415
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -05001416 def test_iter_compat(self):
1417 u = Untokenizer()
1418 token = (NAME, 'Hello')
1419 tokens = [(ENCODING, 'utf-8'), token]
1420 u.compat(token, iter([]))
1421 self.assertEqual(u.tokens, ["Hello "])
1422 u = Untokenizer()
1423 self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1424 u = Untokenizer()
1425 self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1426 self.assertEqual(u.encoding, 'utf-8')
1427 self.assertEqual(untokenize(iter(tokens)), b'Hello ')
1428
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001429
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001430__test__ = {"doctests" : doctests, 'decistmt': decistmt}
1431
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001432def test_main():
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001433 from test import test_tokenize
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001434 support.run_doctest(test_tokenize, True)
1435 support.run_unittest(TestTokenizerAdheresToPep0263)
1436 support.run_unittest(Test_Tokenize)
1437 support.run_unittest(TestDetectEncoding)
1438 support.run_unittest(TestTokenize)
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001439 support.run_unittest(UntokenizeTest)
Neal Norwitzc1505362006-12-28 06:47:50 +00001440
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001441if __name__ == "__main__":
1442 test_main()