"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file.  This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple.  Given a small fragment of source
code, print out a table with the tokens.  The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module.  The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'True'        (3, 4) (3, 8)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)


There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier.  Note that this test involves trailing
whitespace after the colon.  You can't see it, but it's there!

>>> roundtrip("if x == 1 : \\n"
...           "    print x\\n")
if x == 1 : 
    print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# Test roundtrip for `untokenize`.  `f` is a file path.  The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter.  The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

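# A minimal illustrative sketch (not wired into test_main) of why the
# roundtrip check above compares token sequences rather than raw text:
# given bare (type, string) pairs, untokenize() lays out whitespace its
# own way, so the regenerated source can differ textually from the
# original while still tokenizing to the same sequence.  The helper name
# roundtrip_demo and its sample input are illustrative only.
def roundtrip_demo(source="x=1+2\n"):
    tokens = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    rebuilt = untokenize(tokens)            # e.g. "x =1 +2 \n"
    readline = iter(rebuilt.splitlines(1)).next
    assert tokens == [tok[:2] for tok in generate_tokens(readline)]
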
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    print source,

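# Note: dump_tokens() and roundtrip() are the helpers exercised by the
# doctests in the module docstring; run_doctest() in test_main() runs
# those doctests.
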
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

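# Note: decistmt() hands untokenize() bare (type, string) pairs with no
# position information, so untokenize() inserts spacing of its own --
# which is why the doctest above shows "Decimal ('21.3e-5')" rather than
# "Decimal('21.3e-5')".
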
def test_main():
    if verbose:
        print 'starting...'

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()