blob: 6506b671a1e140479995e20c96e77a8fa7cb46ea [file] [log] [blame]
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001doctests = """
2Tests for the tokenize module.
Thomas Wouters89f507f2006-12-13 04:49:30 +00003
Christian Heimesdd15f6c2008-03-16 00:07:10 +00004The tests can be really simple. Given a small fragment of source
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05005code, print out a table with tokens. The ENDMARKER is omitted for
Thomas Wouters89f507f2006-12-13 04:49:30 +00006brevity.
7
Jason R. Coombs7cf36382015-06-20 19:13:50 -04008 >>> import glob
9
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010 >>> dump_tokens("1 + 1")
Trent Nelson428de652008-03-18 22:41:35 +000011 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012 NUMBER '1' (1, 0) (1, 1)
13 OP '+' (1, 2) (1, 3)
14 NUMBER '1' (1, 4) (1, 5)
Thomas Wouters89f507f2006-12-13 04:49:30 +000015
Christian Heimesdd15f6c2008-03-16 00:07:10 +000016 >>> dump_tokens("if False:\\n"
17 ... " # NL\\n"
18 ... " True = False # NEWLINE\\n")
Trent Nelson428de652008-03-18 22:41:35 +000019 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +000020 NAME 'if' (1, 0) (1, 2)
21 NAME 'False' (1, 3) (1, 8)
22 OP ':' (1, 8) (1, 9)
23 NEWLINE '\\n' (1, 9) (1, 10)
24 COMMENT '# NL' (2, 4) (2, 8)
25 NL '\\n' (2, 8) (2, 9)
26 INDENT ' ' (3, 0) (3, 4)
27 NAME 'True' (3, 4) (3, 8)
28 OP '=' (3, 9) (3, 10)
29 NAME 'False' (3, 11) (3, 16)
30 COMMENT '# NEWLINE' (3, 17) (3, 26)
31 NEWLINE '\\n' (3, 26) (3, 27)
32 DEDENT '' (4, 0) (4, 0)
Thomas Wouters89f507f2006-12-13 04:49:30 +000033
Christian Heimesdd15f6c2008-03-16 00:07:10 +000034 >>> indent_error_file = \"""
35 ... def k(x):
36 ... x += 2
37 ... x += 5
38 ... \"""
Trent Nelson428de652008-03-18 22:41:35 +000039 >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
40 >>> for tok in tokenize(readline): pass
Christian Heimesdd15f6c2008-03-16 00:07:10 +000041 Traceback (most recent call last):
42 ...
43 IndentationError: unindent does not match any outer indentation level
Thomas Wouters89f507f2006-12-13 04:49:30 +000044
Mark Dickinson3c0b3172010-06-29 07:38:37 +000045There are some standard formatting practices that are easy to get right.
Thomas Wouters89f507f2006-12-13 04:49:30 +000046
Christian Heimesdd15f6c2008-03-16 00:07:10 +000047 >>> roundtrip("if x == 1:\\n"
48 ... " print(x)\\n")
49 True
Thomas Wouters89f507f2006-12-13 04:49:30 +000050
Christian Heimesdd15f6c2008-03-16 00:07:10 +000051 >>> roundtrip("# This is a comment\\n# This also")
52 True
Thomas Wouters89f507f2006-12-13 04:49:30 +000053
54Some people use different formatting conventions, which makes
Christian Heimesdd15f6c2008-03-16 00:07:10 +000055untokenize a little trickier. Note that this test involves trailing
56whitespace after the colon. Note that we use hex escapes to make the
Trent Nelson428de652008-03-18 22:41:35 +000057two trailing blanks apparent in the expected output.
Thomas Wouters89f507f2006-12-13 04:49:30 +000058
Christian Heimesdd15f6c2008-03-16 00:07:10 +000059 >>> roundtrip("if x == 1 : \\n"
60 ... " print(x)\\n")
61 True
Thomas Wouters89f507f2006-12-13 04:49:30 +000062
Benjamin Petersonee8712c2008-05-20 21:35:26 +000063 >>> f = support.findfile("tokenize_tests.txt")
Trent Nelson428de652008-03-18 22:41:35 +000064 >>> roundtrip(open(f, 'rb'))
Christian Heimesdd15f6c2008-03-16 00:07:10 +000065 True
Thomas Wouters89f507f2006-12-13 04:49:30 +000066
Christian Heimesdd15f6c2008-03-16 00:07:10 +000067 >>> roundtrip("if x == 1:\\n"
68 ... " # A comment by itself.\\n"
69 ... " print(x) # Comment here, too.\\n"
70 ... " # Another comment.\\n"
71 ... "after_if = True\\n")
72 True
Thomas Wouters89f507f2006-12-13 04:49:30 +000073
Christian Heimesdd15f6c2008-03-16 00:07:10 +000074 >>> roundtrip("if (x # The comments need to go in the right place\\n"
75 ... " == 1):\\n"
76 ... " print('x==1')\\n")
77 True
Thomas Wouters89f507f2006-12-13 04:49:30 +000078
Christian Heimesdd15f6c2008-03-16 00:07:10 +000079 >>> roundtrip("class Test: # A comment here\\n"
80 ... " # A comment with weird indent\\n"
81 ... " after_com = 5\\n"
82 ... " def x(m): return m*5 # a one liner\\n"
83 ... " def y(m): # A whitespace after the colon\\n"
84 ... " return y*4 # 3-space indent\\n")
85 True
86
87Some error-handling code
88
89 >>> roundtrip("try: import somemodule\\n"
90 ... "except ImportError: # comment\\n"
Christian Heimesba4af492008-03-28 00:55:15 +000091 ... " print('Can not import' # comment2\\n)"
Neal Norwitz752abd02008-05-13 04:55:24 +000092 ... "else: print('Loaded')\\n")
Christian Heimesdd15f6c2008-03-16 00:07:10 +000093 True
94
Eric Smith74ca5572008-03-17 19:49:19 +000095Balancing continuation
Christian Heimesdd15f6c2008-03-16 00:07:10 +000096
97 >>> roundtrip("a = (3,4, \\n"
98 ... "5,6)\\n"
99 ... "y = [3, 4,\\n"
100 ... "5]\\n"
101 ... "z = {'a': 5,\\n"
102 ... "'b':15, 'c':True}\\n"
103 ... "x = len(y) + 5 - a[\\n"
104 ... "3] - a[2]\\n"
105 ... "+ len(z) - z[\\n"
106 ... "'b']\\n")
107 True
108
109Ordinary integers and binary operators
110
111 >>> dump_tokens("0xff <= 255")
Trent Nelson428de652008-03-18 22:41:35 +0000112 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000113 NUMBER '0xff' (1, 0) (1, 4)
114 OP '<=' (1, 5) (1, 7)
115 NUMBER '255' (1, 8) (1, 11)
Eric Smith74ca5572008-03-17 19:49:19 +0000116 >>> dump_tokens("0b10 <= 255")
Trent Nelson428de652008-03-18 22:41:35 +0000117 ENCODING 'utf-8' (0, 0) (0, 0)
Eric Smith74ca5572008-03-17 19:49:19 +0000118 NUMBER '0b10' (1, 0) (1, 4)
119 OP '<=' (1, 5) (1, 7)
120 NUMBER '255' (1, 8) (1, 11)
121 >>> dump_tokens("0o123 <= 0O123")
Trent Nelson428de652008-03-18 22:41:35 +0000122 ENCODING 'utf-8' (0, 0) (0, 0)
Eric Smith74ca5572008-03-17 19:49:19 +0000123 NUMBER '0o123' (1, 0) (1, 5)
124 OP '<=' (1, 6) (1, 8)
125 NUMBER '0O123' (1, 9) (1, 14)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000126 >>> dump_tokens("1234567 > ~0x15")
Trent Nelson428de652008-03-18 22:41:35 +0000127 ENCODING 'utf-8' (0, 0) (0, 0)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000128 NUMBER '1234567' (1, 0) (1, 7)
129 OP '>' (1, 8) (1, 9)
130 OP '~' (1, 10) (1, 11)
131 NUMBER '0x15' (1, 11) (1, 15)
132 >>> dump_tokens("2134568 != 1231515")
Trent Nelson428de652008-03-18 22:41:35 +0000133 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000134 NUMBER '2134568' (1, 0) (1, 7)
135 OP '!=' (1, 8) (1, 10)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000136 NUMBER '1231515' (1, 11) (1, 18)
137 >>> dump_tokens("(-124561-1) & 200000000")
Trent Nelson428de652008-03-18 22:41:35 +0000138 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000139 OP '(' (1, 0) (1, 1)
140 OP '-' (1, 1) (1, 2)
141 NUMBER '124561' (1, 2) (1, 8)
142 OP '-' (1, 8) (1, 9)
143 NUMBER '1' (1, 9) (1, 10)
144 OP ')' (1, 10) (1, 11)
145 OP '&' (1, 12) (1, 13)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000146 NUMBER '200000000' (1, 14) (1, 23)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000147 >>> dump_tokens("0xdeadbeef != -1")
Trent Nelson428de652008-03-18 22:41:35 +0000148 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000149 NUMBER '0xdeadbeef' (1, 0) (1, 10)
150 OP '!=' (1, 11) (1, 13)
151 OP '-' (1, 14) (1, 15)
152 NUMBER '1' (1, 15) (1, 16)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000153 >>> dump_tokens("0xdeadc0de & 12345")
Trent Nelson428de652008-03-18 22:41:35 +0000154 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000155 NUMBER '0xdeadc0de' (1, 0) (1, 10)
156 OP '&' (1, 11) (1, 12)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000157 NUMBER '12345' (1, 13) (1, 18)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000158 >>> dump_tokens("0xFF & 0x15 | 1234")
Trent Nelson428de652008-03-18 22:41:35 +0000159 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000160 NUMBER '0xFF' (1, 0) (1, 4)
161 OP '&' (1, 5) (1, 6)
162 NUMBER '0x15' (1, 7) (1, 11)
163 OP '|' (1, 12) (1, 13)
164 NUMBER '1234' (1, 14) (1, 18)
165
166Long integers
167
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000168 >>> dump_tokens("x = 0")
Trent Nelson428de652008-03-18 22:41:35 +0000169 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000170 NAME 'x' (1, 0) (1, 1)
171 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000172 NUMBER '0' (1, 4) (1, 5)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000173 >>> dump_tokens("x = 0xfffffffffff")
Trent Nelson428de652008-03-18 22:41:35 +0000174 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000175 NAME 'x' (1, 0) (1, 1)
176 OP '=' (1, 2) (1, 3)
177 NUMBER '0xffffffffff (1, 4) (1, 17)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000178 >>> dump_tokens("x = 123141242151251616110")
Trent Nelson428de652008-03-18 22:41:35 +0000179 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000180 NAME 'x' (1, 0) (1, 1)
181 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000182 NUMBER '123141242151 (1, 4) (1, 25)
183 >>> dump_tokens("x = -15921590215012591")
Trent Nelson428de652008-03-18 22:41:35 +0000184 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000185 NAME 'x' (1, 0) (1, 1)
186 OP '=' (1, 2) (1, 3)
187 OP '-' (1, 4) (1, 5)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000188 NUMBER '159215902150 (1, 5) (1, 22)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000189
190Floating point numbers
191
192 >>> dump_tokens("x = 3.14159")
Trent Nelson428de652008-03-18 22:41:35 +0000193 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000194 NAME 'x' (1, 0) (1, 1)
195 OP '=' (1, 2) (1, 3)
196 NUMBER '3.14159' (1, 4) (1, 11)
197 >>> dump_tokens("x = 314159.")
Trent Nelson428de652008-03-18 22:41:35 +0000198 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000199 NAME 'x' (1, 0) (1, 1)
200 OP '=' (1, 2) (1, 3)
201 NUMBER '314159.' (1, 4) (1, 11)
202 >>> dump_tokens("x = .314159")
Trent Nelson428de652008-03-18 22:41:35 +0000203 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000204 NAME 'x' (1, 0) (1, 1)
205 OP '=' (1, 2) (1, 3)
206 NUMBER '.314159' (1, 4) (1, 11)
207 >>> dump_tokens("x = 3e14159")
Trent Nelson428de652008-03-18 22:41:35 +0000208 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000209 NAME 'x' (1, 0) (1, 1)
210 OP '=' (1, 2) (1, 3)
211 NUMBER '3e14159' (1, 4) (1, 11)
212 >>> dump_tokens("x = 3E123")
Trent Nelson428de652008-03-18 22:41:35 +0000213 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000214 NAME 'x' (1, 0) (1, 1)
215 OP '=' (1, 2) (1, 3)
216 NUMBER '3E123' (1, 4) (1, 9)
217 >>> dump_tokens("x+y = 3e-1230")
Trent Nelson428de652008-03-18 22:41:35 +0000218 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000219 NAME 'x' (1, 0) (1, 1)
220 OP '+' (1, 1) (1, 2)
221 NAME 'y' (1, 2) (1, 3)
222 OP '=' (1, 4) (1, 5)
223 NUMBER '3e-1230' (1, 6) (1, 13)
224 >>> dump_tokens("x = 3.14e159")
Trent Nelson428de652008-03-18 22:41:35 +0000225 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000226 NAME 'x' (1, 0) (1, 1)
227 OP '=' (1, 2) (1, 3)
228 NUMBER '3.14e159' (1, 4) (1, 12)
229
230String literals
231
232 >>> dump_tokens("x = ''; y = \\\"\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000233 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000234 NAME 'x' (1, 0) (1, 1)
235 OP '=' (1, 2) (1, 3)
236 STRING "''" (1, 4) (1, 6)
237 OP ';' (1, 6) (1, 7)
238 NAME 'y' (1, 8) (1, 9)
239 OP '=' (1, 10) (1, 11)
240 STRING '""' (1, 12) (1, 14)
241 >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000242 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000243 NAME 'x' (1, 0) (1, 1)
244 OP '=' (1, 2) (1, 3)
245 STRING '\\'"\\'' (1, 4) (1, 7)
246 OP ';' (1, 7) (1, 8)
247 NAME 'y' (1, 9) (1, 10)
248 OP '=' (1, 11) (1, 12)
249 STRING '"\\'"' (1, 13) (1, 16)
250 >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
Trent Nelson428de652008-03-18 22:41:35 +0000251 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000252 NAME 'x' (1, 0) (1, 1)
253 OP '=' (1, 2) (1, 3)
254 STRING '"doesn\\'t "' (1, 4) (1, 14)
255 NAME 'shrink' (1, 14) (1, 20)
256 STRING '", does it"' (1, 20) (1, 31)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000257 >>> dump_tokens("x = 'abc' + 'ABC'")
Trent Nelson428de652008-03-18 22:41:35 +0000258 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000259 NAME 'x' (1, 0) (1, 1)
260 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000261 STRING "'abc'" (1, 4) (1, 9)
262 OP '+' (1, 10) (1, 11)
263 STRING "'ABC'" (1, 12) (1, 17)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000264 >>> dump_tokens('y = "ABC" + "ABC"')
Trent Nelson428de652008-03-18 22:41:35 +0000265 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000266 NAME 'y' (1, 0) (1, 1)
267 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000268 STRING '"ABC"' (1, 4) (1, 9)
269 OP '+' (1, 10) (1, 11)
270 STRING '"ABC"' (1, 12) (1, 17)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000271 >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
Trent Nelson428de652008-03-18 22:41:35 +0000272 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000273 NAME 'x' (1, 0) (1, 1)
274 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000275 STRING "r'abc'" (1, 4) (1, 10)
276 OP '+' (1, 11) (1, 12)
277 STRING "r'ABC'" (1, 13) (1, 19)
278 OP '+' (1, 20) (1, 21)
279 STRING "R'ABC'" (1, 22) (1, 28)
280 OP '+' (1, 29) (1, 30)
281 STRING "R'ABC'" (1, 31) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000282 >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
Trent Nelson428de652008-03-18 22:41:35 +0000283 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000284 NAME 'y' (1, 0) (1, 1)
285 OP '=' (1, 2) (1, 3)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000286 STRING 'r"abc"' (1, 4) (1, 10)
287 OP '+' (1, 11) (1, 12)
288 STRING 'r"ABC"' (1, 13) (1, 19)
289 OP '+' (1, 20) (1, 21)
290 STRING 'R"ABC"' (1, 22) (1, 28)
291 OP '+' (1, 29) (1, 30)
292 STRING 'R"ABC"' (1, 31) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000293
Meador Inge8d5c0b82012-06-16 21:49:08 -0500294 >>> dump_tokens("u'abc' + U'abc'")
295 ENCODING 'utf-8' (0, 0) (0, 0)
296 STRING "u'abc'" (1, 0) (1, 6)
297 OP '+' (1, 7) (1, 8)
298 STRING "U'abc'" (1, 9) (1, 15)
299 >>> dump_tokens('u"abc" + U"abc"')
300 ENCODING 'utf-8' (0, 0) (0, 0)
301 STRING 'u"abc"' (1, 0) (1, 6)
302 OP '+' (1, 7) (1, 8)
303 STRING 'U"abc"' (1, 9) (1, 15)
Meador Inge8d5c0b82012-06-16 21:49:08 -0500304
305 >>> dump_tokens("b'abc' + B'abc'")
306 ENCODING 'utf-8' (0, 0) (0, 0)
307 STRING "b'abc'" (1, 0) (1, 6)
308 OP '+' (1, 7) (1, 8)
309 STRING "B'abc'" (1, 9) (1, 15)
310 >>> dump_tokens('b"abc" + B"abc"')
311 ENCODING 'utf-8' (0, 0) (0, 0)
312 STRING 'b"abc"' (1, 0) (1, 6)
313 OP '+' (1, 7) (1, 8)
314 STRING 'B"abc"' (1, 9) (1, 15)
315 >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
316 ENCODING 'utf-8' (0, 0) (0, 0)
317 STRING "br'abc'" (1, 0) (1, 7)
318 OP '+' (1, 8) (1, 9)
319 STRING "bR'abc'" (1, 10) (1, 17)
320 OP '+' (1, 18) (1, 19)
321 STRING "Br'abc'" (1, 20) (1, 27)
322 OP '+' (1, 28) (1, 29)
323 STRING "BR'abc'" (1, 30) (1, 37)
324 >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
325 ENCODING 'utf-8' (0, 0) (0, 0)
326 STRING 'br"abc"' (1, 0) (1, 7)
327 OP '+' (1, 8) (1, 9)
328 STRING 'bR"abc"' (1, 10) (1, 17)
329 OP '+' (1, 18) (1, 19)
330 STRING 'Br"abc"' (1, 20) (1, 27)
331 OP '+' (1, 28) (1, 29)
332 STRING 'BR"abc"' (1, 30) (1, 37)
333 >>> dump_tokens("rb'abc' + rB'abc' + Rb'abc' + RB'abc'")
334 ENCODING 'utf-8' (0, 0) (0, 0)
335 STRING "rb'abc'" (1, 0) (1, 7)
336 OP '+' (1, 8) (1, 9)
337 STRING "rB'abc'" (1, 10) (1, 17)
338 OP '+' (1, 18) (1, 19)
339 STRING "Rb'abc'" (1, 20) (1, 27)
340 OP '+' (1, 28) (1, 29)
341 STRING "RB'abc'" (1, 30) (1, 37)
342 >>> dump_tokens('rb"abc" + rB"abc" + Rb"abc" + RB"abc"')
343 ENCODING 'utf-8' (0, 0) (0, 0)
344 STRING 'rb"abc"' (1, 0) (1, 7)
345 OP '+' (1, 8) (1, 9)
346 STRING 'rB"abc"' (1, 10) (1, 17)
347 OP '+' (1, 18) (1, 19)
348 STRING 'Rb"abc"' (1, 20) (1, 27)
349 OP '+' (1, 28) (1, 29)
350 STRING 'RB"abc"' (1, 30) (1, 37)
351
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000352Operators
353
354 >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
Trent Nelson428de652008-03-18 22:41:35 +0000355 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000356 NAME 'def' (1, 0) (1, 3)
357 NAME 'd22' (1, 4) (1, 7)
358 OP '(' (1, 7) (1, 8)
359 NAME 'a' (1, 8) (1, 9)
360 OP ',' (1, 9) (1, 10)
361 NAME 'b' (1, 11) (1, 12)
362 OP ',' (1, 12) (1, 13)
363 NAME 'c' (1, 14) (1, 15)
364 OP '=' (1, 15) (1, 16)
365 NUMBER '2' (1, 16) (1, 17)
366 OP ',' (1, 17) (1, 18)
367 NAME 'd' (1, 19) (1, 20)
368 OP '=' (1, 20) (1, 21)
369 NUMBER '2' (1, 21) (1, 22)
370 OP ',' (1, 22) (1, 23)
371 OP '*' (1, 24) (1, 25)
372 NAME 'k' (1, 25) (1, 26)
373 OP ')' (1, 26) (1, 27)
374 OP ':' (1, 27) (1, 28)
375 NAME 'pass' (1, 29) (1, 33)
376 >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
Trent Nelson428de652008-03-18 22:41:35 +0000377 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000378 NAME 'def' (1, 0) (1, 3)
379 NAME 'd01v_' (1, 4) (1, 9)
380 OP '(' (1, 9) (1, 10)
381 NAME 'a' (1, 10) (1, 11)
382 OP '=' (1, 11) (1, 12)
383 NUMBER '1' (1, 12) (1, 13)
384 OP ',' (1, 13) (1, 14)
385 OP '*' (1, 15) (1, 16)
386 NAME 'k' (1, 16) (1, 17)
387 OP ',' (1, 17) (1, 18)
388 OP '**' (1, 19) (1, 21)
389 NAME 'w' (1, 21) (1, 22)
390 OP ')' (1, 22) (1, 23)
391 OP ':' (1, 23) (1, 24)
392 NAME 'pass' (1, 25) (1, 29)
393
394Comparison
395
396 >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
397 ... "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
Trent Nelson428de652008-03-18 22:41:35 +0000398 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000399 NAME 'if' (1, 0) (1, 2)
400 NUMBER '1' (1, 3) (1, 4)
401 OP '<' (1, 5) (1, 6)
402 NUMBER '1' (1, 7) (1, 8)
403 OP '>' (1, 9) (1, 10)
404 NUMBER '1' (1, 11) (1, 12)
405 OP '==' (1, 13) (1, 15)
406 NUMBER '1' (1, 16) (1, 17)
407 OP '>=' (1, 18) (1, 20)
408 NUMBER '5' (1, 21) (1, 22)
409 OP '<=' (1, 23) (1, 25)
410 NUMBER '0x15' (1, 26) (1, 30)
411 OP '<=' (1, 31) (1, 33)
412 NUMBER '0x12' (1, 34) (1, 38)
413 OP '!=' (1, 39) (1, 41)
414 NUMBER '1' (1, 42) (1, 43)
415 NAME 'and' (1, 44) (1, 47)
416 NUMBER '5' (1, 48) (1, 49)
417 NAME 'in' (1, 50) (1, 52)
418 NUMBER '1' (1, 53) (1, 54)
419 NAME 'not' (1, 55) (1, 58)
420 NAME 'in' (1, 59) (1, 61)
421 NUMBER '1' (1, 62) (1, 63)
422 NAME 'is' (1, 64) (1, 66)
423 NUMBER '1' (1, 67) (1, 68)
424 NAME 'or' (1, 69) (1, 71)
425 NUMBER '5' (1, 72) (1, 73)
426 NAME 'is' (1, 74) (1, 76)
427 NAME 'not' (1, 77) (1, 80)
428 NUMBER '1' (1, 81) (1, 82)
429 OP ':' (1, 82) (1, 83)
430 NAME 'pass' (1, 84) (1, 88)
431
432Shift
433
434 >>> dump_tokens("x = 1 << 1 >> 5")
Trent Nelson428de652008-03-18 22:41:35 +0000435 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000436 NAME 'x' (1, 0) (1, 1)
437 OP '=' (1, 2) (1, 3)
438 NUMBER '1' (1, 4) (1, 5)
439 OP '<<' (1, 6) (1, 8)
440 NUMBER '1' (1, 9) (1, 10)
441 OP '>>' (1, 11) (1, 13)
442 NUMBER '5' (1, 14) (1, 15)
443
444Additive
445
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000446 >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
Trent Nelson428de652008-03-18 22:41:35 +0000447 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000448 NAME 'x' (1, 0) (1, 1)
449 OP '=' (1, 2) (1, 3)
450 NUMBER '1' (1, 4) (1, 5)
451 OP '-' (1, 6) (1, 7)
452 NAME 'y' (1, 8) (1, 9)
453 OP '+' (1, 10) (1, 11)
454 NUMBER '15' (1, 12) (1, 14)
455 OP '-' (1, 15) (1, 16)
Mark Dickinson0c1f7c02008-03-16 05:05:12 +0000456 NUMBER '1' (1, 17) (1, 18)
457 OP '+' (1, 19) (1, 20)
458 NUMBER '0x124' (1, 21) (1, 26)
459 OP '+' (1, 27) (1, 28)
460 NAME 'z' (1, 29) (1, 30)
461 OP '+' (1, 31) (1, 32)
462 NAME 'a' (1, 33) (1, 34)
463 OP '[' (1, 34) (1, 35)
464 NUMBER '5' (1, 35) (1, 36)
465 OP ']' (1, 36) (1, 37)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000466
467Multiplicative
468
469 >>> dump_tokens("x = 1//1*1/5*12%0x12")
Trent Nelson428de652008-03-18 22:41:35 +0000470 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000471 NAME 'x' (1, 0) (1, 1)
472 OP '=' (1, 2) (1, 3)
473 NUMBER '1' (1, 4) (1, 5)
474 OP '//' (1, 5) (1, 7)
475 NUMBER '1' (1, 7) (1, 8)
476 OP '*' (1, 8) (1, 9)
477 NUMBER '1' (1, 9) (1, 10)
478 OP '/' (1, 10) (1, 11)
479 NUMBER '5' (1, 11) (1, 12)
480 OP '*' (1, 12) (1, 13)
481 NUMBER '12' (1, 13) (1, 15)
482 OP '%' (1, 15) (1, 16)
483 NUMBER '0x12' (1, 16) (1, 20)
484
485Unary
486
487 >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
Trent Nelson428de652008-03-18 22:41:35 +0000488 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000489 OP '~' (1, 0) (1, 1)
490 NUMBER '1' (1, 1) (1, 2)
491 OP '^' (1, 3) (1, 4)
492 NUMBER '1' (1, 5) (1, 6)
493 OP '&' (1, 7) (1, 8)
494 NUMBER '1' (1, 9) (1, 10)
495 OP '|' (1, 11) (1, 12)
496 NUMBER '1' (1, 12) (1, 13)
497 OP '^' (1, 14) (1, 15)
498 OP '-' (1, 16) (1, 17)
499 NUMBER '1' (1, 17) (1, 18)
500 >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
Trent Nelson428de652008-03-18 22:41:35 +0000501 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000502 OP '-' (1, 0) (1, 1)
503 NUMBER '1' (1, 1) (1, 2)
504 OP '*' (1, 2) (1, 3)
505 NUMBER '1' (1, 3) (1, 4)
506 OP '/' (1, 4) (1, 5)
507 NUMBER '1' (1, 5) (1, 6)
508 OP '+' (1, 6) (1, 7)
509 NUMBER '1' (1, 7) (1, 8)
510 OP '*' (1, 8) (1, 9)
511 NUMBER '1' (1, 9) (1, 10)
512 OP '//' (1, 10) (1, 12)
513 NUMBER '1' (1, 12) (1, 13)
514 OP '-' (1, 14) (1, 15)
515 OP '-' (1, 16) (1, 17)
516 OP '-' (1, 17) (1, 18)
517 OP '-' (1, 18) (1, 19)
518 NUMBER '1' (1, 19) (1, 20)
519 OP '**' (1, 20) (1, 22)
520 NUMBER '1' (1, 22) (1, 23)
521
522Selector
523
524 >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
Trent Nelson428de652008-03-18 22:41:35 +0000525 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000526 NAME 'import' (1, 0) (1, 6)
527 NAME 'sys' (1, 7) (1, 10)
528 OP ',' (1, 10) (1, 11)
529 NAME 'time' (1, 12) (1, 16)
530 NEWLINE '\\n' (1, 16) (1, 17)
531 NAME 'x' (2, 0) (2, 1)
532 OP '=' (2, 2) (2, 3)
533 NAME 'sys' (2, 4) (2, 7)
534 OP '.' (2, 7) (2, 8)
535 NAME 'modules' (2, 8) (2, 15)
536 OP '[' (2, 15) (2, 16)
537 STRING "'time'" (2, 16) (2, 22)
538 OP ']' (2, 22) (2, 23)
539 OP '.' (2, 23) (2, 24)
540 NAME 'time' (2, 24) (2, 28)
541 OP '(' (2, 28) (2, 29)
542 OP ')' (2, 29) (2, 30)
543
544Methods
545
546 >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
Trent Nelson428de652008-03-18 22:41:35 +0000547 ENCODING 'utf-8' (0, 0) (0, 0)
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000548 OP '@' (1, 0) (1, 1)
549 NAME 'staticmethod (1, 1) (1, 13)
550 NEWLINE '\\n' (1, 13) (1, 14)
551 NAME 'def' (2, 0) (2, 3)
552 NAME 'foo' (2, 4) (2, 7)
553 OP '(' (2, 7) (2, 8)
554 NAME 'x' (2, 8) (2, 9)
555 OP ',' (2, 9) (2, 10)
556 NAME 'y' (2, 10) (2, 11)
557 OP ')' (2, 11) (2, 12)
558 OP ':' (2, 12) (2, 13)
559 NAME 'pass' (2, 14) (2, 18)
560
561Backslash means line continuation, except for comments
562
563 >>> roundtrip("x=1+\\\\n"
564 ... "1\\n"
565 ... "# This is a comment\\\\n"
566 ... "# This also\\n")
567 True
568 >>> roundtrip("# Comment \\\\nx = 0")
569 True
Christian Heimesba4af492008-03-28 00:55:15 +0000570
571Two string literals on the same line
572
573 >>> roundtrip("'' ''")
574 True
575
576Test roundtrip on random python modules.
Antoine Pitrou5bc4fa72010-10-14 15:34:31 +0000577pass the '-ucpu' option to process the full directory.
Christian Heimesba4af492008-03-28 00:55:15 +0000578
579 >>> import random
580 >>> tempdir = os.path.dirname(f) or os.curdir
581 >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
582
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500583Tokenize is broken on test_pep3131.py because regular expressions are
584broken on the obscure unicode identifiers in it. *sigh*
585With roundtrip extended to test the 5-tuple mode of untokenize,
5867 more testfiles fail. Remove them also until the failure is diagnosed.
587
Benjamin Peterson963e4022011-08-13 00:33:21 -0500588 >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500589 >>> for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
590 ... testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
591 ...
Antoine Pitrou5bc4fa72010-10-14 15:34:31 +0000592 >>> if not support.is_resource_enabled("cpu"):
Christian Heimesba4af492008-03-28 00:55:15 +0000593 ... testfiles = random.sample(testfiles, 10)
594 ...
595 >>> for testfile in testfiles:
596 ... if not roundtrip(open(testfile, 'rb')):
597 ... print("Roundtrip failed for file %s" % testfile)
598 ... break
599 ... else: True
600 True
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000601
602Evil tabs
Benjamin Peterson33856de2010-08-30 14:41:20 +0000603
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000604 >>> dump_tokens("def f():\\n\\tif x\\n \\tpass")
605 ENCODING 'utf-8' (0, 0) (0, 0)
606 NAME 'def' (1, 0) (1, 3)
607 NAME 'f' (1, 4) (1, 5)
608 OP '(' (1, 5) (1, 6)
609 OP ')' (1, 6) (1, 7)
610 OP ':' (1, 7) (1, 8)
611 NEWLINE '\\n' (1, 8) (1, 9)
612 INDENT '\\t' (2, 0) (2, 1)
613 NAME 'if' (2, 1) (2, 3)
614 NAME 'x' (2, 4) (2, 5)
615 NEWLINE '\\n' (2, 5) (2, 6)
616 INDENT ' \\t' (3, 0) (3, 9)
617 NAME 'pass' (3, 9) (3, 13)
618 DEDENT '' (4, 0) (4, 0)
619 DEDENT '' (4, 0) (4, 0)
Benjamin Peterson33856de2010-08-30 14:41:20 +0000620
621Non-ascii identifiers
622
623 >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
624 ENCODING 'utf-8' (0, 0) (0, 0)
625 NAME 'Örter' (1, 0) (1, 5)
626 OP '=' (1, 6) (1, 7)
627 STRING "'places'" (1, 8) (1, 16)
628 NEWLINE '\\n' (1, 16) (1, 17)
629 NAME 'grün' (2, 0) (2, 4)
630 OP '=' (2, 5) (2, 6)
631 STRING "'green'" (2, 7) (2, 14)
Armin Ronacherc0eaeca2012-03-04 13:07:57 +0000632
633Legacy unicode literals:
634
Christian Heimes0b3847d2012-06-20 11:17:58 +0200635 >>> dump_tokens("Örter = u'places'\\ngrün = U'green'")
Armin Ronacherc0eaeca2012-03-04 13:07:57 +0000636 ENCODING 'utf-8' (0, 0) (0, 0)
637 NAME 'Örter' (1, 0) (1, 5)
638 OP '=' (1, 6) (1, 7)
639 STRING "u'places'" (1, 8) (1, 17)
640 NEWLINE '\\n' (1, 17) (1, 18)
641 NAME 'grün' (2, 0) (2, 4)
642 OP '=' (2, 5) (2, 6)
Christian Heimes0b3847d2012-06-20 11:17:58 +0200643 STRING "U'green'" (2, 7) (2, 15)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000644"""
645
Benjamin Petersonee8712c2008-05-20 21:35:26 +0000646from test import support
Trent Nelson428de652008-03-18 22:41:35 +0000647from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
Meador Inge00c7f852012-01-19 00:44:45 -0600648 STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
Terry Jan Reedy5e6db312014-02-17 16:45:48 -0500649 open as tokenize_open, Untokenizer)
Trent Nelson428de652008-03-18 22:41:35 +0000650from io import BytesIO
Victor Stinner387729e2015-05-26 00:43:58 +0200651from unittest import TestCase, mock
Jason R. Coombs7cf36382015-06-20 19:13:50 -0400652import os
Meador Inge00c7f852012-01-19 00:44:45 -0600653import token
Raymond Hettinger68c04532005-06-10 11:05:19 +0000654
Thomas Wouters89f507f2006-12-13 04:49:30 +0000655def dump_tokens(s):
656 """Print out the tokens in s in a table format.
657
658 The ENDMARKER is omitted.
659 """
Trent Nelson428de652008-03-18 22:41:35 +0000660 f = BytesIO(s.encode('utf-8'))
661 for type, token, start, end, line in tokenize(f.readline):
Thomas Wouters89f507f2006-12-13 04:49:30 +0000662 if type == ENDMARKER:
663 break
664 type = tok_name[type]
Christian Heimesdd15f6c2008-03-16 00:07:10 +0000665 print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())
Thomas Wouters89f507f2006-12-13 04:49:30 +0000666
Trent Nelson428de652008-03-18 22:41:35 +0000667def roundtrip(f):
668 """
669 Test roundtrip for `untokenize`. `f` is an open file or a string.
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500670 The source code in f is tokenized to both 5- and 2-tuples.
671 Both sequences are converted back to source code via
672 tokenize.untokenize(), and the latter tokenized again to 2-tuples.
673 The test fails if the 3 pair tokenizations do not match.
674
675 When untokenize bugs are fixed, untokenize with 5-tuples should
676 reproduce code that does not contain a backslash continuation
677 following spaces. A proper test should test this.
678
679 This function would be more useful for correcting bugs if it reported
680 the first point of failure, like assertEqual, rather than just
681 returning False -- or if it were only used in unittests and not
682 doctest and actually used assertEqual.
Trent Nelson428de652008-03-18 22:41:35 +0000683 """
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500684 # Get source code and original tokenizations
Trent Nelson428de652008-03-18 22:41:35 +0000685 if isinstance(f, str):
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500686 code = f.encode('utf-8')
687 else:
688 code = f.read()
Brian Curtin9f5f65c2010-10-30 21:35:28 +0000689 f.close()
Terry Jan Reedy938ba682014-02-23 18:00:31 -0500690 readline = iter(code.splitlines(keepends=True)).__next__
691 tokens5 = list(tokenize(readline))
692 tokens2 = [tok[:2] for tok in tokens5]
693 # Reproduce tokens2 from pairs
694 bytes_from2 = untokenize(tokens2)
695 readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
696 tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
697 # Reproduce tokens2 from 5-tuples
698 bytes_from5 = untokenize(tokens5)
699 readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
700 tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
701 # Compare 3 versions
702 return tokens2 == tokens2_from2 == tokens2_from5
Thomas Wouters89f507f2006-12-13 04:49:30 +0000703
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000704# This is an example from the docs, set up as a doctest.
Raymond Hettinger68c04532005-06-10 11:05:19 +0000705def decistmt(s):
706 """Substitute Decimals for floats in a string of statements.
707
708 >>> from decimal import Decimal
Georg Brandl88fc6642007-02-09 21:28:07 +0000709 >>> s = 'print(+21.3e-5*-.1234/81.7)'
Raymond Hettinger68c04532005-06-10 11:05:19 +0000710 >>> decistmt(s)
Georg Brandl88fc6642007-02-09 21:28:07 +0000711 "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"
Raymond Hettinger68c04532005-06-10 11:05:19 +0000712
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000713 The format of the exponent is inherited from the platform C library.
714 Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
Mark Dickinson388122d2010-08-04 20:56:28 +0000715 we're only showing 11 digits, and the 12th isn't close to 5, the
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000716 rest of the output should be platform-independent.
717
718 >>> exec(s) #doctest: +ELLIPSIS
Mark Dickinson388122d2010-08-04 20:56:28 +0000719 -3.2171603427...e-0...7
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000720
721 Output from calculations with Decimal should be identical across all
722 platforms.
723
Raymond Hettinger68c04532005-06-10 11:05:19 +0000724 >>> exec(decistmt(s))
725 -3.217160342717258261933904529E-7
Raymond Hettinger68c04532005-06-10 11:05:19 +0000726 """
727 result = []
Trent Nelson428de652008-03-18 22:41:35 +0000728 g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string
Raymond Hettinger68c04532005-06-10 11:05:19 +0000729 for toknum, tokval, _, _, _ in g:
730 if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens
731 result.extend([
732 (NAME, 'Decimal'),
733 (OP, '('),
734 (STRING, repr(tokval)),
735 (OP, ')')
736 ])
737 else:
738 result.append((toknum, tokval))
Trent Nelson428de652008-03-18 22:41:35 +0000739 return untokenize(result).decode('utf-8')
740
741
742class TestTokenizerAdheresToPep0263(TestCase):
743 """
744 Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
745 """
746
747 def _testFile(self, filename):
748 path = os.path.join(os.path.dirname(__file__), filename)
749 return roundtrip(open(path, 'rb'))
750
751 def test_utf8_coding_cookie_and_no_utf8_bom(self):
Ned Deily2ea6fcc2011-07-19 16:15:27 -0700752 f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
Trent Nelson428de652008-03-18 22:41:35 +0000753 self.assertTrue(self._testFile(f))
754
755 def test_latin1_coding_cookie_and_utf8_bom(self):
756 """
757 As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
758 allowed encoding for the comment is 'utf-8'. The text file used in
759 this test starts with a BOM signature, but specifies latin1 as the
760 coding, so verify that a SyntaxError is raised, which matches the
761 behaviour of the interpreter when it encounters a similar condition.
762 """
763 f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000764 self.assertRaises(SyntaxError, self._testFile, f)
Trent Nelson428de652008-03-18 22:41:35 +0000765
766 def test_no_coding_cookie_and_utf8_bom(self):
767 f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
768 self.assertTrue(self._testFile(f))
769
770 def test_utf8_coding_cookie_and_utf8_bom(self):
771 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
772 self.assertTrue(self._testFile(f))
773
Florent Xicluna11f0b412012-07-07 12:13:35 +0200774 def test_bad_coding_cookie(self):
775 self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
776 self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
777
Trent Nelson428de652008-03-18 22:41:35 +0000778
779class Test_Tokenize(TestCase):
780
781 def test__tokenize_decodes_with_specified_encoding(self):
782 literal = '"ЉЊЈЁЂ"'
783 line = literal.encode('utf-8')
784 first = False
785 def readline():
786 nonlocal first
787 if not first:
788 first = True
789 return line
790 else:
791 return b''
792
793 # skip the initial encoding token and the end token
794 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
795 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melottib3aedd42010-11-20 19:04:17 +0000796 self.assertEqual(tokens, expected_tokens,
797 "bytes not decoded with encoding")
Trent Nelson428de652008-03-18 22:41:35 +0000798
799 def test__tokenize_does_not_decode_with_encoding_none(self):
800 literal = '"ЉЊЈЁЂ"'
801 first = False
802 def readline():
803 nonlocal first
804 if not first:
805 first = True
806 return literal
807 else:
808 return b''
809
810 # skip the end token
811 tokens = list(_tokenize(readline, encoding=None))[:-1]
812 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
Ezio Melottib3aedd42010-11-20 19:04:17 +0000813 self.assertEqual(tokens, expected_tokens,
814 "string not tokenized when encoding is None")
Trent Nelson428de652008-03-18 22:41:35 +0000815
816
817class TestDetectEncoding(TestCase):
818
819 def get_readline(self, lines):
820 index = 0
821 def readline():
822 nonlocal index
823 if index == len(lines):
824 raise StopIteration
825 line = lines[index]
826 index += 1
827 return line
828 return readline
829
830 def test_no_bom_no_encoding_cookie(self):
831 lines = (
832 b'# something\n',
833 b'print(something)\n',
834 b'do_something(else)\n'
835 )
836 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000837 self.assertEqual(encoding, 'utf-8')
838 self.assertEqual(consumed_lines, list(lines[:2]))
Trent Nelson428de652008-03-18 22:41:35 +0000839
840 def test_bom_no_cookie(self):
841 lines = (
842 b'\xef\xbb\xbf# something\n',
843 b'print(something)\n',
844 b'do_something(else)\n'
845 )
846 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000847 self.assertEqual(encoding, 'utf-8-sig')
848 self.assertEqual(consumed_lines,
849 [b'# something\n', b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000850
851 def test_cookie_first_line_no_bom(self):
852 lines = (
853 b'# -*- coding: latin-1 -*-\n',
854 b'print(something)\n',
855 b'do_something(else)\n'
856 )
857 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000858 self.assertEqual(encoding, 'iso-8859-1')
859 self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000860
861 def test_matched_bom_and_cookie_first_line(self):
862 lines = (
863 b'\xef\xbb\xbf# coding=utf-8\n',
864 b'print(something)\n',
865 b'do_something(else)\n'
866 )
867 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000868 self.assertEqual(encoding, 'utf-8-sig')
869 self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000870
871 def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
872 lines = (
873 b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
874 b'print(something)\n',
875 b'do_something(else)\n'
876 )
877 readline = self.get_readline(lines)
878 self.assertRaises(SyntaxError, detect_encoding, readline)
879
880 def test_cookie_second_line_no_bom(self):
881 lines = (
882 b'#! something\n',
883 b'# vim: set fileencoding=ascii :\n',
884 b'print(something)\n',
885 b'do_something(else)\n'
886 )
887 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000888 self.assertEqual(encoding, 'ascii')
Trent Nelson428de652008-03-18 22:41:35 +0000889 expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
Ezio Melottib3aedd42010-11-20 19:04:17 +0000890 self.assertEqual(consumed_lines, expected)
Trent Nelson428de652008-03-18 22:41:35 +0000891
892 def test_matched_bom_and_cookie_second_line(self):
893 lines = (
894 b'\xef\xbb\xbf#! something\n',
895 b'f# coding=utf-8\n',
896 b'print(something)\n',
897 b'do_something(else)\n'
898 )
899 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000900 self.assertEqual(encoding, 'utf-8-sig')
901 self.assertEqual(consumed_lines,
902 [b'#! something\n', b'f# coding=utf-8\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000903
904 def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
905 lines = (
906 b'\xef\xbb\xbf#! something\n',
907 b'# vim: set fileencoding=ascii :\n',
908 b'print(something)\n',
909 b'do_something(else)\n'
910 )
911 readline = self.get_readline(lines)
912 self.assertRaises(SyntaxError, detect_encoding, readline)
913
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200914 def test_cookie_second_line_noncommented_first_line(self):
915 lines = (
916 b"print('\xc2\xa3')\n",
917 b'# vim: set fileencoding=iso8859-15 :\n',
918 b"print('\xe2\x82\xac')\n"
919 )
920 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
921 self.assertEqual(encoding, 'utf-8')
922 expected = [b"print('\xc2\xa3')\n"]
923 self.assertEqual(consumed_lines, expected)
924
925 def test_cookie_second_line_commented_first_line(self):
926 lines = (
927 b"#print('\xc2\xa3')\n",
928 b'# vim: set fileencoding=iso8859-15 :\n',
929 b"print('\xe2\x82\xac')\n"
930 )
931 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
932 self.assertEqual(encoding, 'iso8859-15')
933 expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
934 self.assertEqual(consumed_lines, expected)
935
936 def test_cookie_second_line_empty_first_line(self):
937 lines = (
938 b'\n',
939 b'# vim: set fileencoding=iso8859-15 :\n',
940 b"print('\xe2\x82\xac')\n"
941 )
942 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
943 self.assertEqual(encoding, 'iso8859-15')
944 expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
945 self.assertEqual(consumed_lines, expected)
946
Benjamin Petersond3afada2009-10-09 21:43:09 +0000947 def test_latin1_normalization(self):
948 # See get_normal_name() in tokenizer.c.
949 encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
950 "iso-8859-1-unix", "iso-latin-1-mac")
951 for encoding in encodings:
952 for rep in ("-", "_"):
953 enc = encoding.replace("-", rep)
954 lines = (b"#!/usr/bin/python\n",
955 b"# coding: " + enc.encode("ascii") + b"\n",
956 b"print(things)\n",
957 b"do_something += 4\n")
958 rl = self.get_readline(lines)
959 found, consumed_lines = detect_encoding(rl)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000960 self.assertEqual(found, "iso-8859-1")
Benjamin Petersond3afada2009-10-09 21:43:09 +0000961
Martin v. Löwis63674f42012-04-20 14:36:47 +0200962 def test_syntaxerror_latin1(self):
963 # Issue 14629: need to raise SyntaxError if the first
964 # line(s) have non-UTF-8 characters
965 lines = (
966 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
967 )
968 readline = self.get_readline(lines)
969 self.assertRaises(SyntaxError, detect_encoding, readline)
970
971
Benjamin Petersond3afada2009-10-09 21:43:09 +0000972 def test_utf8_normalization(self):
973 # See get_normal_name() in tokenizer.c.
974 encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
975 for encoding in encodings:
976 for rep in ("-", "_"):
977 enc = encoding.replace("-", rep)
978 lines = (b"#!/usr/bin/python\n",
979 b"# coding: " + enc.encode("ascii") + b"\n",
980 b"1 + 3\n")
981 rl = self.get_readline(lines)
982 found, consumed_lines = detect_encoding(rl)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000983 self.assertEqual(found, "utf-8")
Benjamin Petersond3afada2009-10-09 21:43:09 +0000984
Trent Nelson428de652008-03-18 22:41:35 +0000985 def test_short_files(self):
986 readline = self.get_readline((b'print(something)\n',))
987 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000988 self.assertEqual(encoding, 'utf-8')
989 self.assertEqual(consumed_lines, [b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000990
991 encoding, consumed_lines = detect_encoding(self.get_readline(()))
Ezio Melottib3aedd42010-11-20 19:04:17 +0000992 self.assertEqual(encoding, 'utf-8')
993 self.assertEqual(consumed_lines, [])
Trent Nelson428de652008-03-18 22:41:35 +0000994
995 readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
996 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +0000997 self.assertEqual(encoding, 'utf-8-sig')
998 self.assertEqual(consumed_lines, [b'print(something)\n'])
Trent Nelson428de652008-03-18 22:41:35 +0000999
1000 readline = self.get_readline((b'\xef\xbb\xbf',))
1001 encoding, consumed_lines = detect_encoding(readline)
Ezio Melottib3aedd42010-11-20 19:04:17 +00001002 self.assertEqual(encoding, 'utf-8-sig')
1003 self.assertEqual(consumed_lines, [])
Trent Nelson428de652008-03-18 22:41:35 +00001004
Benjamin Peterson433f32c2008-12-12 01:25:05 +00001005 readline = self.get_readline((b'# coding: bad\n',))
1006 self.assertRaises(SyntaxError, detect_encoding, readline)
Trent Nelson428de652008-03-18 22:41:35 +00001007
Serhiy Storchakadafea852013-09-16 23:51:56 +03001008 def test_false_encoding(self):
1009 # Issue 18873: "Encoding" detected in non-comment lines
1010 readline = self.get_readline((b'print("#coding=fake")',))
1011 encoding, consumed_lines = detect_encoding(readline)
1012 self.assertEqual(encoding, 'utf-8')
1013 self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
1014
Victor Stinner58c07522010-11-09 01:08:59 +00001015 def test_open(self):
1016 filename = support.TESTFN + '.py'
1017 self.addCleanup(support.unlink, filename)
1018
1019 # test coding cookie
1020 for encoding in ('iso-8859-15', 'utf-8'):
1021 with open(filename, 'w', encoding=encoding) as fp:
1022 print("# coding: %s" % encoding, file=fp)
1023 print("print('euro:\u20ac')", file=fp)
1024 with tokenize_open(filename) as fp:
Victor Stinner92665ab2010-11-09 01:11:31 +00001025 self.assertEqual(fp.encoding, encoding)
1026 self.assertEqual(fp.mode, 'r')
Victor Stinner58c07522010-11-09 01:08:59 +00001027
1028 # test BOM (no coding cookie)
1029 with open(filename, 'w', encoding='utf-8-sig') as fp:
1030 print("print('euro:\u20ac')", file=fp)
1031 with tokenize_open(filename) as fp:
Victor Stinner92665ab2010-11-09 01:11:31 +00001032 self.assertEqual(fp.encoding, 'utf-8-sig')
1033 self.assertEqual(fp.mode, 'r')
Victor Stinner58c07522010-11-09 01:08:59 +00001034
Brett Cannonc33f3f22012-04-20 13:23:54 -04001035 def test_filename_in_exception(self):
1036 # When possible, include the file name in the exception.
1037 path = 'some_file_path'
1038 lines = (
1039 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1040 )
1041 class Bunk:
1042 def __init__(self, lines, path):
1043 self.name = path
1044 self._lines = lines
1045 self._index = 0
1046
1047 def readline(self):
1048 if self._index == len(lines):
1049 raise StopIteration
1050 line = lines[self._index]
1051 self._index += 1
1052 return line
1053
1054 with self.assertRaises(SyntaxError):
1055 ins = Bunk(lines, path)
1056 # Make sure lacking a name isn't an issue.
1057 del ins.name
1058 detect_encoding(ins.readline)
1059 with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
1060 ins = Bunk(lines, path)
1061 detect_encoding(ins.readline)
1062
Victor Stinner387729e2015-05-26 00:43:58 +02001063 def test_open_error(self):
1064 # Issue #23840: open() must close the binary file on error
1065 m = BytesIO(b'#coding:xxx')
1066 with mock.patch('tokenize._builtin_open', return_value=m):
1067 self.assertRaises(SyntaxError, tokenize_open, 'foobar')
1068 self.assertTrue(m.closed)
1069
1070
Brett Cannonc33f3f22012-04-20 13:23:54 -04001071
Trent Nelson428de652008-03-18 22:41:35 +00001072class TestTokenize(TestCase):
1073
1074 def test_tokenize(self):
1075 import tokenize as tokenize_module
1076 encoding = object()
1077 encoding_used = None
1078 def mock_detect_encoding(readline):
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001079 return encoding, [b'first', b'second']
Trent Nelson428de652008-03-18 22:41:35 +00001080
1081 def mock__tokenize(readline, encoding):
1082 nonlocal encoding_used
1083 encoding_used = encoding
1084 out = []
1085 while True:
1086 next_line = readline()
1087 if next_line:
1088 out.append(next_line)
1089 continue
1090 return out
1091
1092 counter = 0
1093 def mock_readline():
1094 nonlocal counter
1095 counter += 1
1096 if counter == 5:
1097 return b''
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001098 return str(counter).encode()
Trent Nelson428de652008-03-18 22:41:35 +00001099
1100 orig_detect_encoding = tokenize_module.detect_encoding
1101 orig__tokenize = tokenize_module._tokenize
1102 tokenize_module.detect_encoding = mock_detect_encoding
1103 tokenize_module._tokenize = mock__tokenize
1104 try:
1105 results = tokenize(mock_readline)
Serhiy Storchaka74a49ac2015-03-20 16:46:19 +02001106 self.assertEqual(list(results),
1107 [b'first', b'second', b'1', b'2', b'3', b'4'])
Trent Nelson428de652008-03-18 22:41:35 +00001108 finally:
1109 tokenize_module.detect_encoding = orig_detect_encoding
1110 tokenize_module._tokenize = orig__tokenize
1111
1112 self.assertTrue(encoding_used, encoding)
Raymond Hettinger68c04532005-06-10 11:05:19 +00001113
Meador Inge00c7f852012-01-19 00:44:45 -06001114 def assertExactTypeEqual(self, opstr, *optypes):
1115 tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1116 num_optypes = len(optypes)
1117 self.assertEqual(len(tokens), 2 + num_optypes)
1118 self.assertEqual(token.tok_name[tokens[0].exact_type],
1119 token.tok_name[ENCODING])
1120 for i in range(num_optypes):
1121 self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
1122 token.tok_name[optypes[i]])
1123 self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
1124 token.tok_name[token.ENDMARKER])
1125
1126 def test_exact_type(self):
1127 self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1128 self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1129 self.assertExactTypeEqual(':', token.COLON)
1130 self.assertExactTypeEqual(',', token.COMMA)
1131 self.assertExactTypeEqual(';', token.SEMI)
1132 self.assertExactTypeEqual('+', token.PLUS)
1133 self.assertExactTypeEqual('-', token.MINUS)
1134 self.assertExactTypeEqual('*', token.STAR)
1135 self.assertExactTypeEqual('/', token.SLASH)
1136 self.assertExactTypeEqual('|', token.VBAR)
1137 self.assertExactTypeEqual('&', token.AMPER)
1138 self.assertExactTypeEqual('<', token.LESS)
1139 self.assertExactTypeEqual('>', token.GREATER)
1140 self.assertExactTypeEqual('=', token.EQUAL)
1141 self.assertExactTypeEqual('.', token.DOT)
1142 self.assertExactTypeEqual('%', token.PERCENT)
1143 self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1144 self.assertExactTypeEqual('==', token.EQEQUAL)
1145 self.assertExactTypeEqual('!=', token.NOTEQUAL)
1146 self.assertExactTypeEqual('<=', token.LESSEQUAL)
1147 self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1148 self.assertExactTypeEqual('~', token.TILDE)
1149 self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1150 self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1151 self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1152 self.assertExactTypeEqual('**', token.DOUBLESTAR)
1153 self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1154 self.assertExactTypeEqual('-=', token.MINEQUAL)
1155 self.assertExactTypeEqual('*=', token.STAREQUAL)
1156 self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1157 self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1158 self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1159 self.assertExactTypeEqual('|=', token.VBAREQUAL)
1160 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1161 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1162 self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1163 self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1164 self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1165 self.assertExactTypeEqual('//', token.DOUBLESLASH)
1166 self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
1167 self.assertExactTypeEqual('@', token.AT)
1168
1169 self.assertExactTypeEqual('a**2+b**2==c**2',
1170 NAME, token.DOUBLESTAR, NUMBER,
1171 token.PLUS,
1172 NAME, token.DOUBLESTAR, NUMBER,
1173 token.EQEQUAL,
1174 NAME, token.DOUBLESTAR, NUMBER)
1175 self.assertExactTypeEqual('{1, 2, 3}',
1176 token.LBRACE,
1177 token.NUMBER, token.COMMA,
1178 token.NUMBER, token.COMMA,
1179 token.NUMBER,
1180 token.RBRACE)
1181 self.assertExactTypeEqual('^(x & 0x1)',
1182 token.CIRCUMFLEX,
1183 token.LPAR,
1184 token.NAME, token.AMPER, token.NUMBER,
1185 token.RPAR)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001186
Ezio Melottifafa8b72012-11-03 17:46:51 +02001187 def test_pathological_trailing_whitespace(self):
1188 # See http://bugs.python.org/issue16152
1189 self.assertExactTypeEqual('@ ', token.AT)
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001190
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001191class UntokenizeTest(TestCase):
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001192
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001193 def test_bad_input_order(self):
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001194 # raise if previous row
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001195 u = Untokenizer()
1196 u.prev_row = 2
1197 u.prev_col = 2
1198 with self.assertRaises(ValueError) as cm:
1199 u.add_whitespace((1,3))
Terry Jan Reedy58edfd92014-02-17 16:49:06 -05001200 self.assertEqual(cm.exception.args[0],
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001201 'start (1,3) precedes previous end (2,2)')
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001202 # raise if previous column in row
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001203 self.assertRaises(ValueError, u.add_whitespace, (2,1))
1204
Terry Jan Reedy9dc3a362014-02-23 23:33:08 -05001205 def test_backslash_continuation(self):
1206 # The problem is that <whitespace>\<newline> leaves no token
1207 u = Untokenizer()
1208 u.prev_row = 1
1209 u.prev_col = 1
1210 u.tokens = []
1211 u.add_whitespace((2, 0))
1212 self.assertEqual(u.tokens, ['\\\n'])
1213 u.prev_row = 2
1214 u.add_whitespace((4, 4))
1215 self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' '])
1216 self.assertTrue(roundtrip('a\n b\n c\n \\\n c\n'))
1217
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -05001218 def test_iter_compat(self):
1219 u = Untokenizer()
1220 token = (NAME, 'Hello')
1221 tokens = [(ENCODING, 'utf-8'), token]
1222 u.compat(token, iter([]))
1223 self.assertEqual(u.tokens, ["Hello "])
1224 u = Untokenizer()
1225 self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1226 u = Untokenizer()
1227 self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1228 self.assertEqual(u.encoding, 'utf-8')
1229 self.assertEqual(untokenize(iter(tokens)), b'Hello ')
1230
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001231
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001232class TestRoundtrip(TestCase):
1233 def roundtrip(self, code):
1234 if isinstance(code, str):
1235 code = code.encode('utf-8')
Jason R. Coombsb6d1cdd2015-06-25 22:42:24 -04001236 return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001237
1238 def test_indentation_semantics_retained(self):
1239 """
1240 Ensure that although whitespace might be mutated in a roundtrip,
1241 the semantic meaning of the indentation remains consistent.
1242 """
1243 code = "if False:\n\tx=3\n\tx=3\n"
Jason R. Coombsb6d1cdd2015-06-25 22:42:24 -04001244 codelines = self.roundtrip(code).split('\n')
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001245 self.assertEqual(codelines[1], codelines[2])
1246
1247
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001248__test__ = {"doctests" : doctests, 'decistmt': decistmt}
1249
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001250def test_main():
Christian Heimesdd15f6c2008-03-16 00:07:10 +00001251 from test import test_tokenize
Benjamin Petersonee8712c2008-05-20 21:35:26 +00001252 support.run_doctest(test_tokenize, True)
1253 support.run_unittest(TestTokenizerAdheresToPep0263)
1254 support.run_unittest(Test_Tokenize)
1255 support.run_unittest(TestDetectEncoding)
1256 support.run_unittest(TestTokenize)
Terry Jan Reedy5e6db312014-02-17 16:45:48 -05001257 support.run_unittest(UntokenizeTest)
Jason R. Coombs5713b3c2015-06-20 19:52:22 -04001258 support.run_unittest(TestRoundtrip)
Neal Norwitzc1505362006-12-28 06:47:50 +00001259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001260if __name__ == "__main__":
1261 test_main()