"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted
for brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'True'        (3, 4) (3, 8)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

There will be a bunch more tests of specific source patterns.

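One such pattern, as a starting point: a string literal arrives as a
single STRING token, with its quotes included in the token text.

>>> dump_tokens("x = 'a'")
NAME       'x'           (1, 0) (1, 1)
OP         '='           (1, 2) (1, 3)
STRING     "'a'"         (1, 4) (1, 7)
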
The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

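Given the full 5-tuples from generate_tokens, untokenize reproduces
the text exactly, as the roundtrip examples below show. It also
accepts an iterable of (token type, token string) 2-tuples; in that
limited mode it promises only that the result tokenizes back to the
same tokens, and it appends a space after each NAME and NUMBER token
(decistmt below relies on this mode).

>>> untokenize([(NAME, 'print'), (NUMBER, '1')])
'print 1 '
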
There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier. This test involves trailing whitespace
after the colon; hex escapes make the two trailing blanks apparent in
the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "  print x\\n")
if x == 1 :\x20\x20
  print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random, time, sys
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
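    # splitlines(1) keeps the line endings, so this acts like file.readline.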
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
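    # The trailing comma suppresses print's newline; source already ends
    # with one.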
    print source,

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
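                # repr() adds quotes, turning the float literal into a
                # STRING token (a string argument to Decimal).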
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

def test_main():
    if verbose:
        print 'starting...'

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a still-working message, since this test can be really slow.
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print >>sys.__stdout__, '  test_main still working, be patient...'
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()