doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...     "    # NL\\n"
    ...     "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ...     \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...     "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon, which untokenize must preserve for the
roundtrip comparison to succeed.

    >>> roundtrip("if x == 1 : \\n"
    ...     "    print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...     "    # A comment by itself.\\n"
    ...     "    print(x) # Comment here, too.\\n"
    ...     "    # Another comment.\\n"
    ...     "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...     "    == 1):\\n"
    ...     "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...     "  # A comment with weird indent\\n"
    ...     "  after_com = 5\\n"
    ...     "  def x(m): return m*5 # a one liner\\n"
    ...     "  def y(m): # A whitespace after the colon\\n"
    ...     "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...     "except ImportError: # comment\\n"
    ...     "    print('Can not import' # comment2\\n)"
    ...     "else: print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...     "5,6)\\n"
    ...     "y = [3, 4,\\n"
    ...     "5]\\n"
    ...     "z = {'a': 5,\\n"
    ...     "'b':15, 'c':True}\\n"
    ...     "x = len(y) + 5 - a[\\n"
    ...     "3] - a[2]\\n"
    ...     "+ len(z) - z[\\n"
    ...     "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

    >>> dump_tokens("u'abc' + U'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "u'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('u"abc" + U"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("ur'abc' + uR'abc' + Ur'abc' + UR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "ur'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "uR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Ur'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "UR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('ur"abc" + uR"abc" + Ur"abc" + UR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'ur"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'uR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Ur"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'UR"abc"'     (1, 30) (1, 37)

    >>> dump_tokens("b'abc' + B'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    >>> dump_tokens('b"abc" + B"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...     "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...     "1\\n"
    ...     "# This is a comment\\\\n"
    ...     "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random Python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are broken on
the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)

Legacy unicode literals:

    >>> dump_tokens("Örter = u'places'\\ngrün = UR'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "UR'green'"   (2, 7) (2, 16)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
import token

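# A small helper sketch (an editorial addition, not part of the original
# test suite): tokenize() wants a zero-argument readline callable that
# yields bytes, so wrapping an in-memory string in BytesIO is the usual
# pattern, assuming UTF-8 source.
def _tokenize_string(s):
    # Return the full token list for an in-memory source string.
    return list(tokenize(BytesIO(s.encode('utf-8')).readline))
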
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(keepends=True)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2

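# An editorial sketch (not used by the tests below): roundtrip() compares
# (type, string) pairs because untokenize() only guarantees that limited
# round trip.  Feeding it the full 5-tuples instead asks for a
# whitespace-preserving reconstruction, which holds for well-behaved input.
def roundtrip_exact(source):
    # Tokenize, rebuild from the full token tuples, and compare raw text.
    tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline))
    return untokenize(tokens).decode('utf-8') == source
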
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
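
# Editorial usage sketch for decistmt() (a hypothetical helper, not part of
# the original test suite): the transformed source is ordinary Python, so
# it can be exec'd directly once Decimal is available in the namespace.
def _decistmt_demo():
    # Prints 0.75: the float literals become exact Decimal values.
    exec(decistmt("from decimal import Decimal; print(0.5 + 0.25)"))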


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

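    # Editorial note (a sketch, not an assertion about the tests below):
    # get_readline() builds the same kind of zero-argument, bytes-producing
    # callable that detect_encoding() normally gets from a binary file, e.g.:
    #
    #     with open(path, 'rb') as fp:
    #         encoding, consumed_lines = detect_encoding(fp.readline)
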
    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)
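
    # Editorial sketch of the contract the mocks above rely on (roughly what
    # tokenize() does, not a claim about its exact implementation): the
    # encoding is detected first, and the lines consumed by detection are
    # replayed in front of the remaining input before _tokenize() runs:
    #
    #     from itertools import chain
    #     encoding, consumed = detect_encoding(readline)
    #     rest = iter(readline, b'')
    #     return _tokenize(chain(consumed, rest).__next__, encoding)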

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

__test__ = {"doctests" : doctests, 'decistmt': decistmt}

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()