"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    a    = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'a'           (3, 4) (3, 5)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

' # Emacs hint

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.
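It accepts either the full 5-tuples produced by generate_tokens(),
whose position information lets the original spacing be reproduced
exactly (this is what the roundtrip() helper below relies on), or bare
(type, string) 2-tuples, for which it falls back to a compatibility
mode that regenerates the spacing itself.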

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print(x)\\n")
if x == 1:
    print(x)

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon, and that we use hex escapes to make the
two trailing blanks apparent in the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "    print(x)\\n")
if x == 1 :\x20\x20
    print(x)

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print(x) # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print(x) # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print('x == 1')\\n")
if (x # The comments need to go in the right place
    == 1):
    print('x == 1')

"""

import os, glob, random, time, sys
from io import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print('Testing:', f)
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

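    # Compare just the (type, string) pairs: untokenize() is fed 2-tuples
    # here, so it regenerates whitespace itself and token positions are
    # expected to differ between the two passes.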
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
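        # Fixed-width columns (10 chars for the type, 13 for the token repr)
        # keep the doctest tables in the module docstring aligned.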
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    print(source, end="")

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

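    The extra spaces in the result come from untokenize()'s compatibility
    mode: decistmt() feeds it bare (type, string) 2-tuples, so rather than
    restoring the original column positions it inserts a space after each
    name token, as the output above shows.
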
    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

def test_main():
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    # Exclude test_pep263 which is encoded in KOI8-R
    testfiles = [t for t in testfiles if not t.endswith("pep263.py")]
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a 'still working' message since this test can be really slow.
        if verbose:
            print('    round trip: ', f, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print('  test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')

def test_rarrow():
    """
    This function exists solely to test the tokenization of the RARROW
    operator.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """

if __name__ == "__main__":
    test_main()