doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> import glob

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. Note that we use hex escapes to make the
two trailing blanks apparent in the expected output.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12@42")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random Python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

Tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
With roundtrip extended to test the 5-tuple mode of untokenize,
7 more testfiles fail. Remove them also until the failure is diagnosed.

    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
    ...     testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
    ...
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)

Async/await extension:

    >>> dump_tokens("async = 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)

    >>> dump_tokens("a = (async = 1)")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)

    >>> dump_tokens("async()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)

    >>> dump_tokens("class async(Bar):pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)

    >>> dump_tokens("class async:pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)

    >>> dump_tokens("await = 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)

    >>> dump_tokens("foo.async")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)

    >>> dump_tokens("async for a in b: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)

    >>> dump_tokens("async with a as b: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)

    >>> dump_tokens("async.foo")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)

    >>> dump_tokens("async")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)

    >>> dump_tokens("async\\n#comment\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)

    >>> dump_tokens("async\\n...\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)

    >>> dump_tokens("async\\nawait")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)

    >>> dump_tokens("foo.async + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)

    >>> dump_tokens("async def foo(): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)

    >>> dump_tokens('''async def foo():
    ...   def foo(await):
    ...     await = 1
    ...   if 1:
    ...     await
    ... async += 1
    ... ''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    AWAIT      'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    AWAIT      'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)

    >>> dump_tokens('''async def foo():
    ...   async for i in 1: pass''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)

    >>> dump_tokens('''async def foo(async): await''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    ASYNC      'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    AWAIT      'await'       (1, 22) (1, 27)

    >>> dump_tokens('''def f():
    ...
    ...   def baz(): pass
    ...   async def bar(): pass
    ...
    ...   await = 2''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)

    >>> dump_tokens('''async def f():
    ...
    ...   def baz(): pass
    ...   async def bar(): pass
    ...
    ...   await = 2''')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    AWAIT      'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
import os
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized to both 5- and 2-tuples.
    Both sequences are converted back to source code via
    tokenize.untokenize(), and the latter tokenized again to 2-tuples.
    The test fails if the 3 pair tokenizations do not match.

    When untokenize bugs are fixed, untokenize with 5-tuples should
    reproduce code that does not contain a backslash continuation
    following spaces. A proper test should test this.

    This function would be more useful for correcting bugs if it reported
    the first point of failure, like assertEqual, rather than just
    returning False -- or if it were only used in unittests and not
    doctest and actually used assertEqual.
    """
    # Get source code and original tokenizations
    if isinstance(f, str):
        code = f.encode('utf-8')
    else:
        code = f.read()
        f.close()
    readline = iter(code.splitlines(keepends=True)).__next__
    tokens5 = list(tokenize(readline))
    tokens2 = [tok[:2] for tok in tokens5]
    # Reproduce tokens2 from pairs
    bytes_from2 = untokenize(tokens2)
    readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
    tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
    # Reproduce tokens2 from 5-tuples
    bytes_from5 = untokenize(tokens5)
    readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
    tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
    # Compare 3 versions
    return tokens2 == tokens2_from2 == tokens2_from5

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
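        # Round-trip a test data file that lives alongside this module.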
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
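        # readline() serves the encoded line once, then b'' to mark EOF.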
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
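        # Simulate a binary file's readline(): serve each entry of `lines`
        # in turn, then raise StopIteration when exhausted.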
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
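        # Inputs of zero or one line must still yield a sane encoding.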
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
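        # A minimal file-like stand-in: just a name attribute and readline().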
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

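        # Fake _tokenize: simply collect every line readline() yields
        # until it returns an empty (falsy) line.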
        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
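        # Fake readline: serve b'1' through b'4', then b'' to signal EOF.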
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertTrue(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consecutive, one-line defs is OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER

    def assertExactTypeEqual(self, opstr, *optypes):
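        # Tokenize opstr and check each token's exact_type, allowing for
        # the ENCODING token at the start and the ENDMARKER at the end.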
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)

class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
            'start (1,3) precedes previous end (2,2)')
        # raise if previous column in row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        self.assertTrue(roundtrip('a\n  b\n    c\n  \\\n  c\n'))

    def test_iter_compat(self):
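        # compat() and untokenize() must accept any iterable of tokens,
        # not just a list.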
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):
    def roundtrip(self, code):
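        # Tokenize `code` and regenerate the source text with untokenize().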
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])


__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)
    support.run_unittest(UntokenizeTest)
    support.run_unittest(TestRoundtrip)

if __name__ == "__main__":
    test_main()