# -*- coding: utf-8 -*-

doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def  y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else:   print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random Python modules.
Pass the '-ucompiler' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

    >>> if not support.is_resource_enabled("compiler"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs
    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
"""
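
# These doctests are collected in the __test__ dictionary at the bottom of
# this module and run by support.run_doctest() in test_main().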

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, tok_name, detect_encoding)
from io import BytesIO
from unittest import TestCase
import os, sys, glob

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())
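
# Note: the "-13.13r" format above truncates each token repr to 13
# characters, which is why long values in the doctests appear clipped
# (e.g. "'0xffffffffff" with no closing quote).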

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    token_list = list(tokenize(f.readline))
    f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(1)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2
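
# A minimal sketch (illustrative only, not part of the original module) of
# the property that roundtrip() checks: source regenerated by untokenize()
# retokenizes to the same (type, string) pairs.
def _roundtrip_demo():
    source = b"x = 1 + 2\n"
    tokens1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    regenerated = untokenize(tokens1)   # bytes of equivalent source code
    tokens2 = [tok[:2] for tok in tokenize(BytesIO(regenerated).readline)]
    assert tokens1 == tokens2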

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
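
# The same tokenize/rewrite/untokenize pattern generalizes to other source
# transformations; a sketch of a hypothetical helper (not part of the
# original module) that renames one identifier at the token level:
def _rename_demo(source, old, new):
    result = []
    readline = BytesIO(source.encode('utf-8')).readline
    for toknum, tokval, _, _, _ in tokenize(readline):
        if toknum == NAME and tokval == old:
            tokval = new                # swap the identifier's text
        result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')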


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines,
                          [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'iso-8859-1')
        self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEquals(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines,
                          [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEquals(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEquals(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8-sig')
        self.assertEquals(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

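# A minimal sketch (illustrative only, not part of the original suite) of the
# detect_encoding() behaviour exercised above: it reads at most two lines and
# returns the normalized encoding name plus the lines it consumed, mirroring
# TestDetectEncoding.test_cookie_first_line_no_bom.
def _detect_encoding_demo():
    source = b'# -*- coding: latin-1 -*-\nprint("hi")\n'
    encoding, consumed = detect_encoding(BytesIO(source).readline)
    assert encoding == 'iso-8859-1'     # 'latin-1' is normalized
    assert consumed == [b'# -*- coding: latin-1 -*-\n']
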
class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEquals(encoding_used, encoding)


__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()