"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted
for brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'True'        (3, 4) (3, 8)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

There will be a bunch more tests of specific source patterns.

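One such pattern, as a starting point: a string literal arrives as a
single STRING token, with its quotes included in the token text.

>>> dump_tokens("x = 'a'")
NAME       'x'           (1, 0) (1, 1)
OP         '='           (1, 2) (1, 3)
STRING     "'a'"         (1, 4) (1, 7)
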
The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

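Given the full 5-tuples from generate_tokens, untokenize reproduces
the text exactly, as the roundtrip examples below show. It also
accepts an iterable of (token type, token string) 2-tuples; in that
limited mode it promises only that the result tokenizes back to the
same tokens, and it appends a space after each NAME and NUMBER token
(decistmt below relies on this mode).

>>> untokenize([(NAME, 'print'), (NUMBER, '1')])
'print 1 '
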
There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier. This test involves trailing whitespace
after the colon; hex escapes make the two trailing blanks apparent in
the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "  print x\\n")
if x == 1 :\x20\x20
  print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random, time, sys
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
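    # splitlines(1) keeps the line endings, so this acts like file.readline.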
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
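    # The trailing comma suppresses print's newline; source already ends
    # with one.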
    print source,

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
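                # repr() adds quotes, turning the float literal into a
                # STRING token (a string argument to Decimal).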
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

def test_main():
    if verbose:
        print 'starting...'

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a still-working message, since this test can be really slow.
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print >>sys.__stdout__, '  test_main still working, be patient...'
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()