doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> import glob

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "    print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12@42")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True
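
Dumping the same line shows the two literals as separate STRING tokens
(this example is an added illustration; the columns follow the fixed-width
format produced by dump_tokens below):

    >>> dump_tokens("'' ''")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "''"          (1, 0) (1, 2)
    STRING     "''"          (1, 3) (1, 5)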

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

Tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
With roundtrip extended to test the 5-tuple mode of untokenize,
7 more testfiles fail. Remove them also until the failure is diagnosed.

    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
    ...     testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
    ...
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)

Async/await extension:

    >>> dump_tokens("async = 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)

    >>> dump_tokens("a = (async = 1)")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)

    >>> dump_tokens("async()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)

    >>> dump_tokens("class async(Bar):pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)

    >>> dump_tokens("class async:pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)

    >>> dump_tokens("await = 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)

    >>> dump_tokens("foo.async")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)

    >>> dump_tokens("async for a in b: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)

    >>> dump_tokens("async with a as b: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)

    >>> dump_tokens("async.foo")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)

    >>> dump_tokens("async")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)

    >>> dump_tokens("async\\n#comment\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)

    >>> dump_tokens("async\\n...\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)

    >>> dump_tokens("async\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)

    >>> dump_tokens("foo.async + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)

    >>> dump_tokens("async def foo(): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)

    >>> dump_tokens('''async def foo():
    ...   def foo(await):
    ...     await = 1
    ...   if 1:
    ...     await
    ... async += 1
    ... ''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    NAME       'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)

    >>> dump_tokens('''async def foo():
    ...   async for i in 1: pass''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                     open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
import os
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
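        # Fixed-width columns: token type (10 chars), repr of the token
        # string (13 chars, truncated), then the (row, col) start and end.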
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized to both 5- and 2-tuples.
    Both sequences are converted back to source code via
    tokenize.untokenize(), and the latter tokenized again to 2-tuples.
    The test fails if the 3 pair tokenizations do not match.

    When untokenize bugs are fixed, untokenize with 5-tuples should
    reproduce code that does not contain a backslash continuation
    following spaces. A proper test should test this.

    This function would be more useful for correcting bugs if it reported
    the first point of failure, like assertEqual, rather than just
    returning False -- or if it were only used in unittests and not
    doctest and actually used assertEqual.
    """
    # Get source code and original tokenizations
    if isinstance(f, str):
        code = f.encode('utf-8')
    else:
        code = f.read()
        f.close()
    readline = iter(code.splitlines(keepends=True)).__next__
    tokens5 = list(tokenize(readline))
    tokens2 = [tok[:2] for tok in tokens5]
    # Reproduce tokens2 from pairs
    bytes_from2 = untokenize(tokens2)
    readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
    tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
    # Reproduce tokens2 from 5-tuples
    bytes_from5 = untokenize(tokens5)
    readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
    tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
    # Compare 3 versions
    return tokens2 == tokens2_from2 == tokens2_from5
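
# A minimal sketch (not part of the test suite) of the invariant that
# roundtrip() checks, assuming an in-memory source like the doctests use:
#
#     code = b"x = 1\n"
#     t2 = [tok[:2] for tok in tokenize(iter(code.splitlines(True)).__next__)]
#     regen = untokenize(t2)          # 2-tuple mode; spacing may change
#     t2_again = [tok[:2] for tok in
#                 tokenize(iter(regen.splitlines(True)).__next__)]
#     assert t2 == t2_again           # the (type, string) pairs must survive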

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
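        # Each 5-tuple is (type, string, start, end, line); type 3 is
        # token.STRING.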
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consecutive one-line defs are OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK')  # [-1] is always ENDMARKER

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                         'start (1,3) precedes previous end (2,2)')
        # raise if previous column in row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        self.assertTrue(roundtrip('a\n b\n c\n \\\n c\n'))

    def test_iter_compat(self):
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):
    def roundtrip(self, code):
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])


__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)
    support.run_unittest(UntokenizeTest)
    support.run_unittest(TestRoundtrip)

if __name__ == "__main__":
    test_main()