doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
       ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. Note that we use hex escapes to make the
two trailing blanks apparent in the expected output.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def  y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else:   print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are
broken on the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
import token

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
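    # untokenize() returns bytes; re-tokenize them by feeding the lines
    # through a generator whose __next__ serves as the readline callable.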
    readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
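        # One-shot readline: serve the encoded line once, then keep
        # returning b'' so the tokenizer sees EOF.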
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
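        # Same one-shot pattern, but the line is served as a str, since
        # encoding=None tells _tokenize() the input is already decoded.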
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
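        # Build a readline() callable that serves `lines` one at a time,
        # mimicking the interface detect_encoding() expects.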
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
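            # Minimal stand-in for a file object: provides readline() and a
            # .name attribute so detect_encoding() can report the path.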
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
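        # Stub out detect_encoding() and _tokenize() so we can check that
        # tokenize() replays the lines consumed during encoding detection
        # before the ones pulled from readline, and that it forwards the
        # detected encoding to _tokenize().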
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

    def assertExactTypeEqual(self, opstr, *optypes):
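        # Tokenize opstr and check the exact_type of every token between
        # the leading ENCODING marker and the trailing ENDMARKER.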
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()