"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    a    = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'a'           (3, 4) (3, 5)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

' # Emacs hint

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print(x)\\n")
if x == 1:
    print(x)

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. Note that we use hex escapes to make the
two trailing blanks apparent in the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "  print(x)\\n")
if x == 1 :\x20\x20
  print(x)

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print(x) # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print(x) # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print('x == 1')\\n")
if (x # The comments need to go in the right place
    == 1):
    print('x == 1')

"""

# ' Emacs hint

import os, glob, random, time, sys
import re
from io import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print('Testing:', f)
    # Get the encoding first, from a coding cookie in the first two lines.
    fobj = open(f, encoding="latin-1")
    first2lines = fobj.readline() + fobj.readline()
    fobj.close()
    m = re.search(r"coding:\s*(\S+)", first2lines)
    if m:
        encoding = m.group(1)
        ## print("    coding:", encoding)
    else:
        encoding = "utf-8"
    fobj = open(f, encoding=encoding)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    # splitlines(1) keeps the line endings, which generate_tokens expects.
    readline = iter(newtext.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

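# A minimal sketch (hypothetical, nothing in this test calls it) of the
# same roundtrip invariant applied to an in-memory string instead of a
# file; the helper name and sample fragment are made up for illustration.
def _roundtrip_string_sketch(sample="x = 3 * (4 + 5)\n"):
    t1 = [tok[:2] for tok in generate_tokens(StringIO(sample).readline)]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2, "untokenize() roundtrip failed for the sample"
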
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        # Pad the token name to 10 columns and the token repr to 13, then
        # append the start and end positions; locals() supplies the values.
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

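# For comparison, a hypothetical sketch (unused by the tests) of the same
# dump written with positional arguments instead of the locals() idiom:
def _dump_tokens_sketch(s):
    for type, token, start, end, line in generate_tokens(StringIO(s).readline):
        if type == ENDMARKER:
            break
        print("%-10.10s %-13.13r %s %s" % (tok_name[type], token, start, end))
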
137def roundtrip(s):
138 f = StringIO(s)
139 source = untokenize(generate_tokens(f.readline))
Guido van Rossum0bcbb0d2007-02-09 22:43:10 +0000140 print(source, end="")
Thomas Wouters89f507f2006-12-13 04:49:30 +0000141
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

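# The token-rewriting pattern above generalizes to other substitutions.
# As an illustrative sketch only (this helper is made up and unused by
# the tests), the same loop can rename a single identifier:
def _rename_name_sketch(s, old, new):
    result = []
    for toknum, tokval, _, _, _ in generate_tokens(StringIO(s).readline):
        if toknum == NAME and tokval == old:
            result.append((NAME, new))      # swap in the new identifier
        else:
            result.append((toknum, tokval))
    return untokenize(result)
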
def test_main():
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # Validate the tokenize_tests.txt file.
    # This makes sure it compiles, and displays any errors in it.
    f = open(findfile('tokenize_tests.txt'))
    sf = f.read()
    f.close()
    cf = compile(sf, 'tokenize_tests.txt', 'exec')

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests.txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over test_tokenize.py too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('test_tokenize.py')
    if verbose:
        print('    round trip: ', f, file=sys.__stdout__)
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a still-working message, since this test can be really slow.
        if verbose:
            print('    round trip: ', f, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print('  test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')

def test_rarrow():
    """
    This function exists solely to test the tokenization of the RARROW
    operator.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """

if __name__ == "__main__":
    test_main()