# -*- coding: utf-8 -*-

doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "    print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, tok_name, detect_encoding)
from io import BytesIO
from unittest import TestCase
import os, sys, glob

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(1)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2
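
# A minimal sketch (not used by the tests) of the round-trip property that
# roundtrip() above verifies: the (type, string) pairs produced by tokenize()
# are expected to be unchanged after regenerating source with untokenize().
def _roundtrip_sketch(source=b"x = 1 + 2\n"):
    first = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    regenerated = untokenize(first)   # bytes, because of the ENCODING token
    second = [tok[:2] for tok in tokenize(BytesIO(regenerated).readline)]
    return first == second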

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
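
# Note on the spacing in decistmt()'s expected output above: when untokenize()
# is given bare (type, string) pairs it cannot recover the original column
# layout, so it emits a space after NAME and NUMBER tokens; hence
# "print (+Decimal ('21.3e-5')..." rather than "print(+Decimal('21.3e-5')...".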
641
642
643class TestTokenizerAdheresToPep0263(TestCase):
644 """
645 Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
646 """
647
648 def _testFile(self, filename):
649 path = os.path.join(os.path.dirname(__file__), filename)
650 return roundtrip(open(path, 'rb'))
651
652 def test_utf8_coding_cookie_and_no_utf8_bom(self):
653 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
654 self.assertTrue(self._testFile(f))
655
656 def test_latin1_coding_cookie_and_utf8_bom(self):
657 """
658 As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
659 allowed encoding for the comment is 'utf-8'. The text file used in
660 this test starts with a BOM signature, but specifies latin1 as the
661 coding, so verify that a SyntaxError is raised, which matches the
662 behaviour of the interpreter when it encounters a similar condition.
663 """
664 f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
Georg Brandlab91fde2009-08-13 08:51:18 +0000665 self.assertRaises(SyntaxError, self._testFile, f)
Trent Nelson428de652008-03-18 22:41:35 +0000666
667 def test_no_coding_cookie_and_utf8_bom(self):
668 f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
669 self.assertTrue(self._testFile(f))
670
671 def test_utf8_coding_cookie_and_utf8_bom(self):
672 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
673 self.assertTrue(self._testFile(f))
674
675
676class Test_Tokenize(TestCase):
677
678 def test__tokenize_decodes_with_specified_encoding(self):
679 literal = '"ЉЊЈЁЂ"'
680 line = literal.encode('utf-8')
681 first = False
682 def readline():
683 nonlocal first
684 if not first:
685 first = True
686 return line
687 else:
688 return b''
689
690 # skip the initial encoding token and the end token
691 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
692 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000693 self.assertEqual(tokens, expected_tokens,
694 "bytes not decoded with encoding")
Trent Nelson428de652008-03-18 22:41:35 +0000695
696 def test__tokenize_does_not_decode_with_encoding_none(self):
697 literal = '"ЉЊЈЁЂ"'
698 first = False
699 def readline():
700 nonlocal first
701 if not first:
702 first = True
703 return literal
704 else:
705 return b''
706
707 # skip the end token
708 tokens = list(_tokenize(readline, encoding=None))[:-1]
709 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melotti19f2aeb2010-11-21 01:30:29 +0000710 self.assertEqual(tokens, expected_tokens,
711 "string not tokenized when encoding is None")
Trent Nelson428de652008-03-18 22:41:35 +0000712
713
class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

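# A hedged illustration (mirroring the tests above) of detect_encoding()'s
# contract: it returns the canonical encoding name plus the list of raw lines
# it consumed while looking for a BOM or a PEP 263 coding cookie, e.g.
#
#     >>> detect_encoding(BytesIO(b"# coding: latin-1\nx = 1\n").readline)
#     ('iso-8859-1', [b'# coding: latin-1\n'])
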
class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)


__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()