"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'        (1, 0) (1, 1)
OP         '+'        (1, 2) (1, 3)
NUMBER     '1'        (1, 4) (1, 5)

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens. (It doesn't
work very well at the moment.)

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x ==1 :
    print x
"""

import os, glob, random
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING)

# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

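    # Compare only the (token type, token string) pairs. untokenize() is
    # not expected to preserve whitespace exactly (see the module
    # docstring), so the start/end positions of the regenerated tokens
    # may legitimately differ from the originals.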
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
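    # splitlines(1) keeps the trailing newlines, and the bound .next
    # method supplies the readline-style callable generate_tokens() expects.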
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
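        # -10.10 pads each column to ten characters and truncates anything
        # longer, so the table stays aligned for short tokens.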
        print "%(type)-10.10s %(token)-10.10r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    print untokenize(generate_tokens(f.readline)),

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

def test_main():
    if verbose:
        print 'starting...'

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

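    # 'baz' dedents to a column that matches no open indentation level,
    # which the tokenizer must report as an IndentationError.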
    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()