"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    a    = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'a'           (3, 4) (3, 5)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

' # Emacs hint

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print(x)\\n")
if x == 1:
    print(x)

Some people use different formatting conventions, which makes
untokenize a little trickier. This test involves trailing whitespace
after the colon; we use hex escapes to make the two trailing blanks
apparent in the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "    print(x)\\n")
if x == 1 :\x20\x20
    print(x)

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print(x) # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print(x) # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print('x == 1')\\n")
if (x # The comments need to go in the right place
    == 1):
    print('x == 1')

"""

# ' Emacs hint

import os, glob, random, time, sys
import re
from io import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

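# A sketch (illustrative, not part of the original tests) of the NL/NEWLINE
# distinction described in the module docstring: NEWLINE closes a logical
# line of code, while NL closes a blank or comment-only line.
def _nl_vs_newline_sketch():
    lines = iter(["# just a comment\n", "x = 1\n"])
    names = [tok_name[tok[0]] for tok in generate_tokens(lines.__next__)]
    assert "NL" in names and "NEWLINE" in names
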
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print('Testing:', f)
    # Get the encoding first.
    fobj = open(f, encoding="latin-1")
    first2lines = fobj.readline() + fobj.readline()
    fobj.close()
    m = re.search(r"coding:\s*(\S+)", first2lines)
    if m:
        encoding = m.group(1)
        print(" coding:", encoding)
    else:
        encoding = "utf-8"
    fobj = open(f, encoding=encoding)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    # Compare the (type, string) pairs from the first tokenization with
    # those from tokenizing the untokenized text.
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(True)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

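# A minimal sketch (not part of the original test suite; the helper name is
# illustrative) of the same roundtrip contract on an in-memory string: with
# 2-tuples, untokenize() guarantees only that its output tokenizes back to
# the same (type, string) pairs, not that the original spelling is preserved.
def _roundtrip_string_sketch(source="3 + 4\n"):
    tokens = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    rebuilt = untokenize(tokens)
    readline = iter(rebuilt.splitlines(True)).__next__
    assert tokens == [tok[:2] for tok in generate_tokens(readline)]
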
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    print(source, end="")

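# Sketch of the readline protocol the helpers above rely on (illustrative,
# not part of the original tests): generate_tokens() accepts any
# zero-argument callable that returns successive source lines, so an
# iterator's __next__ works just as well as StringIO(...).readline.
def _tokens_from_lines_sketch(lines=["x = 1\n"]):
    return [tok[:2] for tok in generate_tokens(iter(lines).__next__)]
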
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = generate_tokens(StringIO(s).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

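# Usage sketch for decistmt() (illustrative, not part of the original
# tests): any float literal, i.e. a NUMBER token containing '.', gets
# wrapped, with the spacing shown in the doctest above.
def _decistmt_usage_sketch():
    assert "Decimal ('1.5')" in decistmt('x = 1.5\n')
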
def test_main():
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a still-working message since this test can be really slow.
        if verbose:
            print(' round trip: ', f, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print(' test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')

def test_rarrow():
    """
    This function exists solely to test the tokenization of the RARROW
    operator.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """

if __name__ == "__main__":
    test_main()