doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2

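
# A minimal sketch of the property roundtrip() verifies, kept out of the
# test run (the helper name _roundtrip_sketch is ours, not part of the
# suite): untokenize() is free to normalize whitespace, so the bytes need
# not round-trip exactly, but retokenizing its output must reproduce the
# same (type, string) pairs.
def _roundtrip_sketch():
    source = b"if x == 1 :\n  print(x)\n"
    tokens1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    new_bytes = untokenize(tokens1)   # bytes, since the ENCODING token is kept
    tokens2 = [tok[:2] for tok in tokenize(BytesIO(new_bytes).readline)]
    assert tokens1 == tokens2
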
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

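    # Illustrative sketch of the API exercised below (a comment, not a test
    # method): detect_encoding() reads at most two lines from the readline
    # callable and returns the encoding name plus the lines it consumed, e.g.
    #
    #   encoding, consumed = detect_encoding(
    #       BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n').readline)
    #   # encoding == 'iso-8859-1' (the normalized cookie name)
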
    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")',  # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)

__test__ = {"doctests": doctests, "decistmt": decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()