"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file.  This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple.  Given a small fragment of source
code, print out a table with the tokens.  The ENDMARKER is omitted for
brevity.
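Each row shows the token type, the token string, and the start and end
of the token as (row, column) tuples.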

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module.  The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    a    = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'a'           (3, 4) (3, 5)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)


There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

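As a minimal check, a simple statement should come back unchanged when
its tokens are fed back to untokenize:

>>> roundtrip("x = 1\\n")
x = 1
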
There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print(x)\\n")
if x == 1:
    print(x)

Some people use different formatting conventions, which makes
untokenize a little trickier.  Note that this test involves trailing
whitespace after the colon.  We use hex escapes to make the two
trailing blanks apparent in the expected output.

>>> roundtrip("if   x  ==  1  :  \\n"
...           "    print(x)\\n")
if   x  ==  1  :\x20\x20
    print(x)

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print(x) # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print(x) # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print('x == 1')\\n")
if (x # The comments need to go in the right place
    == 1):
    print('x == 1')

80"""
81
82import os, glob, random, time, sys
Guido van Rossumc43e79f2007-06-18 18:26:36 +000083from io import StringIO
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000084from test.test_support import (verbose, findfile, is_resource_enabled,
85 TestFailed)
Thomas Wouters89f507f2006-12-13 04:49:30 +000086from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
87 ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
88
89# How much time in seconds can pass before we print a 'Still working' message.
90_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test roundtrip for `untokenize`.  `f` is a file path.  The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter.  The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

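    # Keep only the (type, string) fields; without position information,
    # untokenize() regenerates equivalent (not byte-identical) source.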
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
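    # splitlines(1) keeps the line endings, so __next__ behaves like the
    # readline() of an open file.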
    readline = iter(newtext.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
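        # Columns: type name padded to 10 characters, token repr padded
        # to 13, then the start and end (row, col) positions.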
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
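    # source keeps its own trailing newline, so don't let print() add one.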
    print(source, end="")

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

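    The extra spaces (as in "print (") appear because untokenize() is
    given (type, string) pairs only; without position information it
    must guess at token spacing.
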
    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

def test_main():
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a 'still working' message since this test can be really slow.
        if verbose:
            print('    round trip: ', f, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print('  test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')

def test_rarrow():
    """
    This function exists solely to test the tokenization of the RARROW
    operator.

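    The expected output reflects tokenize()'s printing format: start
    row,col - end row,col, then the token type and token string.
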
    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """

if __name__ == "__main__":
    test_main()