Blame - Lib/test/test_tokenize.py - platform/external/python/cpython3

blob: 4d8d9f14da821400bc1972c7a58d615e997486d1 [file] [log] [blame]

Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	1	"""Tests for the tokenize module.
				2
				3	The tests were originally written in the old Python style, where the
				4	test output was compared to a golden file. This docstring represents
				5	the first steps towards rewriting the entire test as a doctest.
				6
				7	The tests can be really simple. Given a small fragment of source
				8	code, print out a table with the tokens. The ENDMARKER is omitted for
				9	brevity.
				10
				11	>>> dump_tokens("1 + 1")
				12	NUMBER '1' (1, 0) (1, 1)
				13	OP '+' (1, 2) (1, 3)
				14	NUMBER '1' (1, 4) (1, 5)
				15
				16	A comment generates a token here, unlike in the parser module. The
				17	comment token is followed by an NL or a NEWLINE token, depending on
				18	whether the line contains the completion of a statement.
				19
				20	>>> dump_tokens("if False:\\n"
				21	... " # NL\\n"
Guido van Rossum	e7ba495	2007-06-06 23:52:48 +0000	[diff] [blame]	22	... " a = False # NEWLINE\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	23	NAME 'if' (1, 0) (1, 2)
				24	NAME 'False' (1, 3) (1, 8)
				25	OP ':' (1, 8) (1, 9)
				26	NEWLINE '\\n' (1, 9) (1, 10)
				27	COMMENT '# NL' (2, 4) (2, 8)
				28	NL '\\n' (2, 8) (2, 9)
				29	INDENT ' ' (3, 0) (3, 4)
Guido van Rossum	e7ba495	2007-06-06 23:52:48 +0000	[diff] [blame]	30	NAME 'a' (3, 4) (3, 5)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	31	OP '=' (3, 9) (3, 10)
				32	NAME 'False' (3, 11) (3, 16)
				33	COMMENT '# NEWLINE' (3, 17) (3, 26)
				34	NEWLINE '\\n' (3, 26) (3, 27)
				35	DEDENT '' (4, 0) (4, 0)
				36
				37
				38	There will be a bunch more tests of specific source patterns.
				39
				40	The tokenize module also defines an untokenize function that should
				41	regenerate the original program text from the tokens.
				42
				43	There are some standard formatting practices that are easy to get right.
				44
				45	>>> roundtrip("if x == 1:\\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	46	... " print(x)\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	47	if x == 1:
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	48	print(x)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	49
				50	Some people use different formatting conventions, which makes
				51	untokenize a little trickier. Note that this test involves trailing
				52	whitespace after the colon. Note that we use hex escapes to make the
				53	two trailing blanks apparent in the expected output.
				54
				55	>>> roundtrip("if x == 1 : \\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	56	... " print(x)\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	57	if x == 1 :\x20\x20
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	58	print(x)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	59
				60	Comments need to go in the right place.
				61
				62	>>> roundtrip("if x == 1:\\n"
				63	... " # A comment by itself.\\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	64	... " print(x) # Comment here, too.\\n"
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	65	... " # Another comment.\\n"
				66	... "after_if = True\\n")
				67	if x == 1:
				68	# A comment by itself.
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	69	print(x) # Comment here, too.
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	70	# Another comment.
				71	after_if = True
				72
				73	>>> roundtrip("if (x # The comments need to go in the right place\\n"
				74	... " == 1):\\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	75	... " print('x == 1')\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	76	if (x # The comments need to go in the right place
				77	== 1):
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	78	print('x == 1')
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	79
				80	"""
				81
				82	import os, glob, random, time, sys
Guido van Rossum	c43e79f	2007-06-18 18:26:36 +0000	[diff] [blame]	83	from io import StringIO
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	84	from test.test_support import (verbose, findfile, is_resource_enabled,
				85	TestFailed)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	86	from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
				87	ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
				88
				89	# How much time in seconds can pass before we print a 'Still working' message.
				90	_PRINT_WORKING_MSG_INTERVAL = 5 * 60
Guido van Rossum	0874f7f	1997-10-27 22:15:06 +0000	[diff] [blame]	91
def test_roundtrip(f):
    """Test roundtrip for `untokenize`.  `f` is a file path.

    The source code in `f` is tokenized, the (type, string) pairs are
    converted back to source code via untokenize(), and that text is
    tokenized again.

    Raises TestFailed if the second tokenization doesn't match the first.
    """
    # The context manager closes the file even if tokenization raises.
    with open(f) as fobj:
        # Only the token type and string are compared; positions are dropped.
        t1 = [tok[:2] for tok in generate_tokens(fobj.readline)]

    newtext = untokenize(t1)
    # keepends=True so every line handed back to the tokenizer keeps its
    # line terminator.
    readline = iter(newtext.splitlines(True)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)
Raymond Hettinger	68c0453	2005-06-10 11:05:19 +0000	[diff] [blame]	110
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    Each row shows the token type name, the repr of the token string, and
    the start and end positions.  The type name is padded/truncated to 10
    characters and the repr to 13 characters.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for tok_type, token, start, end, _line in generate_tokens(f.readline):
        if tok_type == ENDMARKER:
            break
        # Explicit positional arguments instead of "% locals()", so the
        # builtin `type` is not shadowed and the format string does not
        # depend on local variable names.
        print("%-10.10s %-13.13r %s %s"
              % (tok_name[tok_type], token, start, end))
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	122
				123	def roundtrip(s):
				124	f = StringIO(s)
				125	source = untokenize(generate_tokens(f.readline))
Guido van Rossum	0bcbb0d	2007-02-09 22:43:10 +0000	[diff] [blame]	126	print(source, end="")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	127
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    rewritten = []
    # Walk the token stream; only the token type and string are needed.
    for kind, text, *_ in generate_tokens(StringIO(s).readline):
        is_float_literal = kind == NUMBER and '.' in text
        if not is_float_literal:
            # Everything else passes through unchanged.
            rewritten.append((kind, text))
            continue
        # Replace the NUMBER token with the call Decimal('<literal>').
        rewritten += [
            (NAME, 'Decimal'),
            (OP, '('),
            (STRING, repr(text)),
            (OP, ')'),
        ]
    return untokenize(rewritten)
				165
def test_main():
    """Run the tokenize tests.

    Steps, in order:
      1. Tokenize the tokenize_tests sample file to stdout.
      2. Run test_roundtrip() over the sample file and over test*.py files
         in the same directory (all of them if the "compiler" resource is
         enabled, otherwise a random sample of at most 10).
      3. Check that inconsistent dedentation raises IndentationError.
      4. Run the doctests in this module.

    Raises TestFailed if the IndentationError is not detected;
    test_roundtrip() raises TestFailed on a roundtrip mismatch.
    """
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    sample_path = findfile('tokenize_tests' + os.extsep + 'txt')
    # The context manager closes the file even if tokenize() raises.
    with open(sample_path) as sample:
        tokenize(sample.readline)

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    test_roundtrip(sample_path)

    testdir = os.path.dirname(sample_path) or os.curdir
    testfiles = glob.glob(os.path.join(testdir, 'test*.py'))
    if not is_resource_enabled('compiler'):
        # random.sample() raises ValueError when asked for more items than
        # are available, so cap the sample size at the number of files.
        testfiles = random.sample(testfiles, min(10, len(testfiles)))

    for path in testfiles:
        # Print still working message since this test can be really slow
        if verbose:
            print(' round trip: ', path, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print(' test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(path)

    # Test detection of IndentationError.
    # 'baz' is dedented to a column that matches no enclosing indentation
    # level, which the tokenizer must reject.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError:")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	223
def test_rarrow():
    """
    Doctest-only function: its sole purpose is to exercise tokenization
    of the RARROW operator.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """
				233
# Run the whole test suite when this module is executed as a script.
if __name__ == "__main__":
    test_main()