# -*- coding: utf-8 -*-

doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random Python modules.
Pass the '-ucompiler' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

    >>> if not support.is_resource_enabled("compiler"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, tok_name, detect_encoding)
from io import BytesIO
from unittest import TestCase
import os, sys, glob

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

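# A note on the table dump_tokens() prints: %-10.10s and %-13.13r both pad
# *and* truncate their fields, so a line for "1 + 1" looks like
#     NUMBER     '1'           (1, 0) (1, 1)
# and the repr of a very long literal in the doctests above is clipped at 13
# characters, e.g.
#     NUMBER     '0xffffffffff (1, 4) (1, 17)
# which is why its closing quote is missing.
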
def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    token_list = list(tokenize(f.readline))
    f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(1)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2

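# Note on roundtrip() above: untokenize() returns bytes while tokenize()
# expects a readline callable, so the regenerated source is fed back one
# line at a time through a generator's __next__. splitlines(1) means
# keepends=True; the trailing newlines are kept because the tokenizer needs
# to see complete lines. Only the (type, string) pairs are compared, so
# token positions are allowed to move between the two passes.
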
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


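# decistmt() above is the usual token-rewriting recipe: walk the token
# stream, substitute the tokens of interest, and let untokenize() turn the
# result back into source. Because only (type, string) 2-tuples are passed
# in, untokenize() runs in its positionless compatibility mode and chooses
# its own spacing, which is why the doctest output above reads
# "print (+Decimal ('21.3e-5')...)" with extra blanks.
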
class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
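        # (token type 3 is STRING)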
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEquals(tokens, expected_tokens,
                          "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

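    # detect_encoding() reads at most the first two lines, so the tests below
    # check both the encoding it reports and exactly which lines it consumed;
    # a caller has to replay the consumed lines into the tokenizer itself.
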
    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines,
                          [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'iso-8859-1')
        self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEquals(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines,
                          [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEquals(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEquals(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEquals(encoding, 'utf-8')
        self.assertEquals(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEquals(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEquals(encoding_used, encoding)


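# run_doctest() below ends up in doctest.testmod(), which collects the extra
# doctest suites registered in this __test__ mapping as well as docstrings.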
__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()