"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'True'        (3, 4) (3, 8)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier. This test involves trailing whitespace
after the colon; note that we use hex escapes to make the two
trailing blanks apparent in the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "    print x\\n")
if x == 1 :\x20\x20
    print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# Test roundtrip for `untokenize`. `f` is a file path. The source code in
# `f` is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    # splitlines(1) keeps the line endings, which the readline protocol
    # expected by generate_tokens() requires.
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

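# A minimal sketch (illustrative only, never called by the test) of the two
# untokenize() modes exercised in this file, assuming the behavior that
# tokenize documents: full 5-tuples let untokenize() honor the recorded
# positions, so the source should round-trip exactly, while (type, string)
# 2-tuples drop the positions and untokenize() can only guess the spacing.
# The name `untokenize_demo` is made up for this example.
def untokenize_demo(source):
    tokens = list(generate_tokens(StringIO(source).readline))
    exact = untokenize(tokens)                   # full mode: honors positions
    rough = untokenize([t[:2] for t in tokens])  # compatibility mode
    return exact, rough
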
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        # The fixed field widths produce the aligned table expected by the
        # doctests in the module docstring.
        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    # The regenerated source already ends with a newline, so the trailing
    # comma keeps print from adding another.
    print source,

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

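# decistmt() hands untokenize() plain (type, string) 2-tuples, so the result
# is rendered in untokenize()'s compatibility mode, which appends a space
# after name and number tokens; that is why the doctest above shows
# "Decimal ('21.3e-5')" rather than "Decimal('21.3e-5')".
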
def test_main():
    if verbose:
        print 'starting...'

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()