# -*- coding: utf-8 -*-

doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon, which untokenize must reproduce exactly.

    >>> roundtrip("if x == 1 : \\n"
    ...           "    print(x)\\n")
    True

    >>> f = test_support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print 'Can not import' # comment2\\n"
    ...           "else: print 'Loaded'\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''      (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'       (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True
"""

from test import test_support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, tok_name, detect_encoding)
from io import BytesIO
from unittest import TestCase
import os, sys, glob

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
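        # Fixed-width columns: the type name is clipped to 10 characters and
        # the token's repr to 13, which is why long tokens appear truncated
        # in the expected doctest output above.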
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    token_list = list(tokenize(f.readline))
    f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = iter(new_bytes.splitlines(1)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2
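
# A minimal, self-contained sketch (not exercised by test_main) of the
# identity that roundtrip() checks: tokenize -> untokenize -> tokenize
# must yield the same (token type, token string) pairs.  The helper name
# _roundtrip_sketch is ours, not part of the tokenize API.
def _roundtrip_sketch():
    source = b"if x == 1:\n    print(x)\n"
    tokens1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    # untokenize() returns bytes here because the stream starts with an
    # ENCODING token, so the result can be tokenized straight away.
    new_bytes = untokenize(tokens1)
    tokens2 = [tok[:2] for tok in tokenize(BytesIO(new_bytes).readline)]
    assert tokens1 == tokens2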

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
586
587
588class TestTokenizerAdheresToPep0263(TestCase):
589 """
590 Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
591 """
592
593 def _testFile(self, filename):
594 path = os.path.join(os.path.dirname(__file__), filename)
595 return roundtrip(open(path, 'rb'))
596
597 def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


621class Test_Tokenize(TestCase):
622
623 def test__tokenize_decodes_with_specified_encoding(self):
624 literal = '"ЉЊЈЁЂ"'
625 line = literal.encode('utf-8')
626 first = False
627 def readline():
628 nonlocal first
629 if not first:
630 first = True
631 return line
632 else:
633 return b''
634
635 # skip the initial encoding token and the end token
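        # Each token is a 5-tuple (type, string, start, end, line);
        # token type 3 is STRING.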
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "string not tokenized when encoding is None")


659class TestDetectEncoding(TestCase):
660
661 def get_readline(self, lines):
662 index = 0
663 def readline():
664 nonlocal index
665 if index == len(lines):
666 raise StopIteration
667 line = lines[index]
668 index += 1
669 return line
670 return readline
671
    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines,
                          [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'latin-1')
        self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEquals(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines,
                          [b'#! something\n', b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [])

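# A small, self-contained sketch (not exercised by test_main) of the
# behaviour tested above: detect_encoding() reads at most two lines,
# looking for a BOM and/or a coding cookie, and also returns the lines
# it consumed.  The helper name _detect_encoding_sketch is ours.
def _detect_encoding_sketch():
    src = BytesIO(b'# -*- coding: latin-1 -*-\nprint(something)\n')
    encoding, consumed = detect_encoding(src.readline)
    assert encoding == 'latin-1'
    assert consumed == [b'# -*- coding: latin-1 -*-\n']
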

class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

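        # Stand-in for tokenize._tokenize: record the encoding handed to it
        # and replay whatever mock_readline() produces until EOF.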
        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEquals(encoding_used, encoding)


__test__ = {'doctests': doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    test_support.run_doctest(test_tokenize, True)
    test_support.run_unittest(TestTokenizerAdheresToPep0263)
    test_support.run_unittest(Test_Tokenize)
    test_support.run_unittest(TestDetectEncoding)
    test_support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()