doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
       ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else:   print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

109 >>> dump_tokens("0xff <= 255")
Trent Nelson428de652008-03-18 22:41:35 +0000110 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000111 NUMBER '0xff' (1, 0) (1, 4)
112 OP '<=' (1, 5) (1, 7)
113 NUMBER '255' (1, 8) (1, 11)
Eric Smith74ca5572008-03-17 19:49:19 +0000114 >>> dump_tokens("0b10 <= 255")
Trent Nelson428de652008-03-18 22:41:35 +0000115 ENCODING 'utf-8' (0, 0) (0, 0)
Eric Smith74ca5572008-03-17 19:49:19 +0000116 NUMBER '0b10' (1, 0) (1, 4)
117 OP '<=' (1, 5) (1, 7)
118 NUMBER '255' (1, 8) (1, 11)
119 >>> dump_tokens("0o123 <= 0O123")
Trent Nelson428de652008-03-18 22:41:35 +0000120 ENCODING 'utf-8' (0, 0) (0, 0)
Eric Smith74ca5572008-03-17 19:49:19 +0000121 NUMBER '0o123' (1, 0) (1, 5)
122 OP '<=' (1, 6) (1, 8)
123 NUMBER '0O123' (1, 9) (1, 14)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000124 >>> dump_tokens("1234567 > ~0x15")
Trent Nelson428de652008-03-18 22:41:35 +0000125 ENCODING 'utf-8' (0, 0) (0, 0)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000126 NUMBER '1234567' (1, 0) (1, 7)
127 OP '>' (1, 8) (1, 9)
128 OP '~' (1, 10) (1, 11)
129 NUMBER '0x15' (1, 11) (1, 15)
130 >>> dump_tokens("2134568 != 1231515")
Trent Nelson428de652008-03-18 22:41:35 +0000131 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000132 NUMBER '2134568' (1, 0) (1, 7)
133 OP '!=' (1, 8) (1, 10)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000134 NUMBER '1231515' (1, 11) (1, 18)
135 >>> dump_tokens("(-124561-1) & 200000000")
Trent Nelson428de652008-03-18 22:41:35 +0000136 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000137 OP '(' (1, 0) (1, 1)
138 OP '-' (1, 1) (1, 2)
139 NUMBER '124561' (1, 2) (1, 8)
140 OP '-' (1, 8) (1, 9)
141 NUMBER '1' (1, 9) (1, 10)
142 OP ')' (1, 10) (1, 11)
143 OP '&' (1, 12) (1, 13)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000144 NUMBER '200000000' (1, 14) (1, 23)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000145 >>> dump_tokens("0xdeadbeef != -1")
Trent Nelson428de652008-03-18 22:41:35 +0000146 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000147 NUMBER '0xdeadbeef' (1, 0) (1, 10)
148 OP '!=' (1, 11) (1, 13)
149 OP '-' (1, 14) (1, 15)
150 NUMBER '1' (1, 15) (1, 16)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000151 >>> dump_tokens("0xdeadc0de & 12345")
Trent Nelson428de652008-03-18 22:41:35 +0000152 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000153 NUMBER '0xdeadc0de' (1, 0) (1, 10)
154 OP '&' (1, 11) (1, 12)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000155 NUMBER '12345' (1, 13) (1, 18)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000156 >>> dump_tokens("0xFF & 0x15 | 1234")
Trent Nelson428de652008-03-18 22:41:35 +0000157 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000158 NUMBER '0xFF' (1, 0) (1, 4)
159 OP '&' (1, 5) (1, 6)
160 NUMBER '0x15' (1, 7) (1, 11)
161 OP '|' (1, 12) (1, 13)
162 NUMBER '1234' (1, 14) (1, 18)
163
Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

230 >>> dump_tokens("x = ''; y = \\\"\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000231 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000232 NAME 'x' (1, 0) (1, 1)
233 OP '=' (1, 2) (1, 3)
234 STRING "''" (1, 4) (1, 6)
235 OP ';' (1, 6) (1, 7)
236 NAME 'y' (1, 8) (1, 9)
237 OP '=' (1, 10) (1, 11)
238 STRING '""' (1, 12) (1, 14)
239 >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000240 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000241 NAME 'x' (1, 0) (1, 1)
242 OP '=' (1, 2) (1, 3)
243 STRING '\\'"\\'' (1, 4) (1, 7)
244 OP ';' (1, 7) (1, 8)
245 NAME 'y' (1, 9) (1, 10)
246 OP '=' (1, 11) (1, 12)
247 STRING '"\\'"' (1, 13) (1, 16)
248 >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000249 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000250 NAME 'x' (1, 0) (1, 1)
251 OP '=' (1, 2) (1, 3)
252 STRING '"doesn\\'t "' (1, 4) (1, 14)
253 NAME 'shrink' (1, 14) (1, 20)
254 STRING '", does it"' (1, 20) (1, 31)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000255 >>> dump_tokens("x = 'abc' + 'ABC'")
Trent Nelson428de652008-03-18 22:41:35 +0000256 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000257 NAME 'x' (1, 0) (1, 1)
258 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000259 STRING "'abc'" (1, 4) (1, 9)
260 OP '+' (1, 10) (1, 11)
261 STRING "'ABC'" (1, 12) (1, 17)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000262 >>> dump_tokens('y = "ABC" + "ABC"')
Trent Nelson428de652008-03-18 22:41:35 +0000263 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000264 NAME 'y' (1, 0) (1, 1)
265 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000266 STRING '"ABC"' (1, 4) (1, 9)
267 OP '+' (1, 10) (1, 11)
268 STRING '"ABC"' (1, 12) (1, 17)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000269 >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
Trent Nelson428de652008-03-18 22:41:35 +0000270 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000271 NAME 'x' (1, 0) (1, 1)
272 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000273 STRING "r'abc'" (1, 4) (1, 10)
274 OP '+' (1, 11) (1, 12)
275 STRING "r'ABC'" (1, 13) (1, 19)
276 OP '+' (1, 20) (1, 21)
277 STRING "R'ABC'" (1, 22) (1, 28)
278 OP '+' (1, 29) (1, 30)
279 STRING "R'ABC'" (1, 31) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000280 >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
Trent Nelson428de652008-03-18 22:41:35 +0000281 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000282 NAME 'y' (1, 0) (1, 1)
283 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000284 STRING 'r"abc"' (1, 4) (1, 10)
285 OP '+' (1, 11) (1, 12)
286 STRING 'r"ABC"' (1, 13) (1, 19)
287 OP '+' (1, 20) (1, 21)
288 STRING 'R"ABC"' (1, 22) (1, 28)
289 OP '+' (1, 29) (1, 30)
290 STRING 'R"ABC"' (1, 31) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000291
Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

336 >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
337 ... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
Trent Nelson428de652008-03-18 22:41:35 +0000338 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000339 NAME 'if' (1, 0) (1, 2)
340 NUMBER '1' (1, 3) (1, 4)
341 OP '<' (1, 5) (1, 6)
342 NUMBER '1' (1, 7) (1, 8)
343 OP '>' (1, 9) (1, 10)
344 NUMBER '1' (1, 11) (1, 12)
345 OP '==' (1, 13) (1, 15)
346 NUMBER '1' (1, 16) (1, 17)
347 OP '>=' (1, 18) (1, 20)
348 NUMBER '5' (1, 21) (1, 22)
349 OP '<=' (1, 23) (1, 25)
350 NUMBER '0x15' (1, 26) (1, 30)
351 OP '<=' (1, 31) (1, 33)
352 NUMBER '0x12' (1, 34) (1, 38)
353 OP '!=' (1, 39) (1, 41)
354 NUMBER '1' (1, 42) (1, 43)
355 NAME 'and' (1, 44) (1, 47)
356 NUMBER '5' (1, 48) (1, 49)
357 NAME 'in' (1, 50) (1, 52)
358 NUMBER '1' (1, 53) (1, 54)
359 NAME 'not' (1, 55) (1, 58)
360 NAME 'in' (1, 59) (1, 61)
361 NUMBER '1' (1, 62) (1, 63)
362 NAME 'is' (1, 64) (1, 66)
363 NUMBER '1' (1, 67) (1, 68)
364 NAME 'or' (1, 69) (1, 71)
365 NUMBER '5' (1, 72) (1, 73)
366 NAME 'is' (1, 74) (1, 76)
367 NAME 'not' (1, 77) (1, 80)
368 NUMBER '1' (1, 81) (1, 82)
369 OP ':' (1, 82) (1, 83)
370 NAME 'pass' (1, 84) (1, 88)
371
372Shift
373
374 >>> dump_tokens("x = 1 << 1 >> 5")
Trent Nelson428de652008-03-18 22:41:35 +0000375 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000376 NAME 'x' (1, 0) (1, 1)
377 OP '=' (1, 2) (1, 3)
378 NUMBER '1' (1, 4) (1, 5)
379 OP '<<' (1, 6) (1, 8)
380 NUMBER '1' (1, 9) (1, 10)
381 OP '>>' (1, 11) (1, 13)
382 NUMBER '5' (1, 14) (1, 15)
383
384Additive
385
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000386 >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
Trent Nelson428de652008-03-18 22:41:35 +0000387 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000388 NAME 'x' (1, 0) (1, 1)
389 OP '=' (1, 2) (1, 3)
390 NUMBER '1' (1, 4) (1, 5)
391 OP '-' (1, 6) (1, 7)
392 NAME 'y' (1, 8) (1, 9)
393 OP '+' (1, 10) (1, 11)
394 NUMBER '15' (1, 12) (1, 14)
395 OP '-' (1, 15) (1, 16)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000396 NUMBER '1' (1, 17) (1, 18)
397 OP '+' (1, 19) (1, 20)
398 NUMBER '0x124' (1, 21) (1, 26)
399 OP '+' (1, 27) (1, 28)
400 NAME 'z' (1, 29) (1, 30)
401 OP '+' (1, 31) (1, 32)
402 NAME 'a' (1, 33) (1, 34)
403 OP '[' (1, 34) (1, 35)
404 NUMBER '5' (1, 35) (1, 36)
405 OP ']' (1, 36) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000406
407Multiplicative
408
409 >>> dump_tokens("x = 1//1*1/5*12%0x12")
Trent Nelson428de652008-03-18 22:41:35 +0000410 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000411 NAME 'x' (1, 0) (1, 1)
412 OP '=' (1, 2) (1, 3)
413 NUMBER '1' (1, 4) (1, 5)
414 OP '//' (1, 5) (1, 7)
415 NUMBER '1' (1, 7) (1, 8)
416 OP '*' (1, 8) (1, 9)
417 NUMBER '1' (1, 9) (1, 10)
418 OP '/' (1, 10) (1, 11)
419 NUMBER '5' (1, 11) (1, 12)
420 OP '*' (1, 12) (1, 13)
421 NUMBER '12' (1, 13) (1, 15)
422 OP '%' (1, 15) (1, 16)
423 NUMBER '0x12' (1, 16) (1, 20)
424
425Unary
426
427 >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
Trent Nelson428de652008-03-18 22:41:35 +0000428 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000429 OP '~' (1, 0) (1, 1)
430 NUMBER '1' (1, 1) (1, 2)
431 OP '^' (1, 3) (1, 4)
432 NUMBER '1' (1, 5) (1, 6)
433 OP '&' (1, 7) (1, 8)
434 NUMBER '1' (1, 9) (1, 10)
435 OP '|' (1, 11) (1, 12)
436 NUMBER '1' (1, 12) (1, 13)
437 OP '^' (1, 14) (1, 15)
438 OP '-' (1, 16) (1, 17)
439 NUMBER '1' (1, 17) (1, 18)
440 >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
Trent Nelson428de652008-03-18 22:41:35 +0000441 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000442 OP '-' (1, 0) (1, 1)
443 NUMBER '1' (1, 1) (1, 2)
444 OP '*' (1, 2) (1, 3)
445 NUMBER '1' (1, 3) (1, 4)
446 OP '/' (1, 4) (1, 5)
447 NUMBER '1' (1, 5) (1, 6)
448 OP '+' (1, 6) (1, 7)
449 NUMBER '1' (1, 7) (1, 8)
450 OP '*' (1, 8) (1, 9)
451 NUMBER '1' (1, 9) (1, 10)
452 OP '//' (1, 10) (1, 12)
453 NUMBER '1' (1, 12) (1, 13)
454 OP '-' (1, 14) (1, 15)
455 OP '-' (1, 16) (1, 17)
456 OP '-' (1, 17) (1, 18)
457 OP '-' (1, 18) (1, 19)
458 NUMBER '1' (1, 19) (1, 20)
459 OP '**' (1, 20) (1, 22)
460 NUMBER '1' (1, 22) (1, 23)
461
462Selector
463
464 >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
Trent Nelson428de652008-03-18 22:41:35 +0000465 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000466 NAME 'import' (1, 0) (1, 6)
467 NAME 'sys' (1, 7) (1, 10)
468 OP ',' (1, 10) (1, 11)
469 NAME 'time' (1, 12) (1, 16)
470 NEWLINE '\\n' (1, 16) (1, 17)
471 NAME 'x' (2, 0) (2, 1)
472 OP '=' (2, 2) (2, 3)
473 NAME 'sys' (2, 4) (2, 7)
474 OP '.' (2, 7) (2, 8)
475 NAME 'modules' (2, 8) (2, 15)
476 OP '[' (2, 15) (2, 16)
477 STRING "'time'" (2, 16) (2, 22)
478 OP ']' (2, 22) (2, 23)
479 OP '.' (2, 23) (2, 24)
480 NAME 'time' (2, 24) (2, 28)
481 OP '(' (2, 28) (2, 29)
482 OP ')' (2, 29) (2, 30)
483
484Methods
485
486 >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
Trent Nelson428de652008-03-18 22:41:35 +0000487 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000488 OP '@' (1, 0) (1, 1)
489 NAME 'staticmethod (1, 1) (1, 13)
490 NEWLINE '\\n' (1, 13) (1, 14)
491 NAME 'def' (2, 0) (2, 3)
492 NAME 'foo' (2, 4) (2, 7)
493 OP '(' (2, 7) (2, 8)
494 NAME 'x' (2, 8) (2, 9)
495 OP ',' (2, 9) (2, 10)
496 NAME 'y' (2, 10) (2, 11)
497 OP ')' (2, 11) (2, 12)
498 OP ':' (2, 12) (2, 13)
499 NAME 'pass' (2, 14) (2, 18)
500
501Backslash means line continuation, except for comments
502
503 >>> roundtrip("x=1+\\\\n"
504 ... "1\\n"
505 ... "# This is a comment\\\\n"
506 ... "# This also\\n")
507 True
508 >>> roundtrip("# Comment \\\\nx = 0")
509 True
Christian Heimesba4af492008-03-28 00:55:15 +0000510
511Two string literals on the same line
512
513 >>> roundtrip("'' ''")
514 True
515
Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = UR'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "UR'green'"   (2, 7) (2, 16)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
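        # Fixed column widths (10 chars for the type, 13 for the token
        # repr) keep the table aligned; longer tokens are clipped, which
        # is why e.g. 'staticmethod appears truncated in the doctests
        # above.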
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
602 """
603 Test roundtrip for `untokenize`. `f` is an open file or a string.
604 The source code in f is tokenized, converted back to source code via
605 tokenize.untokenize(), and tokenized again from the latter. The test
606 fails if the second tokenization doesn't match the first.
607 """
608 if isinstance(f, str):
609 f = BytesIO(f.encode('utf-8'))
Brian Curtin9f5f65c2010-10-30 21:35:28 +0000610 try:
611 token_list = list(tokenize(f.readline))
612 finally:
613 f.close()
Trent Nelson428de652008-03-18 22:41:35 +0000614 tokens1 = [tok[:2] for tok in token_list]
615 new_bytes = untokenize(tokens1)
Ezio Melottid8b509b2011-09-28 17:37:55 +0300616 readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
Trent Nelson428de652008-03-18 22:41:35 +0000617 tokens2 = [tok[:2] for tok in tokenize(readline)]
618 return tokens1 == tokens2
Thomas Wouters89f507f2006-12-13 04:49:30 +0000619
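# A minimal sketch of the property roundtrip() verifies (illustrative
# only, not part of the test suite): untokenize() on (type, string)
# pairs may normalize whitespace, which is why the comparison is over
# token streams rather than over the raw bytes.
def _roundtrip_sketch(source=b"if x == 1 :\n    print(x)\n"):
    toks1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    new_source = untokenize(toks1)  # bytes; whitespace may differ
    toks2 = [tok[:2] for tok in tokenize(BytesIO(new_source).readline)]
    return toks1 == toks2
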
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
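        # (the tuple fields are type, string, start, end, line; type 3
        # is token.STRING)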
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
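        # Emulate a binary file's readline(): hand out the given lines
        # one at a time, then raise StopIteration, which
        # detect_encoding() treats as end of input.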
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

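        # tokenize() must replay the lines already consumed by
        # detect_encoding() ('first' and 'second' here) before drawing
        # fresh lines from readline(); the assertion below checks that.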
        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

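    # tokenize() reports every operator with the generic token.OP type;
    # TokenInfo.exact_type recovers the specific operator (token.PLUS,
    # token.LPAR, ...), which is what the helper below verifies.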
    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

__test__ = {"doctests": doctests, "decistmt": decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()