"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    a    = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'a'           (3, 4) (3, 5)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

' # Emacs hint

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.
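It accepts either the full 5-tuples produced by generate_tokens(),
whose position information lets the original spacing be reproduced
exactly (this is what the roundtrip() helper below relies on), or bare
(type, string) 2-tuples, for which it falls back to a compatibility
mode that regenerates the spacing itself.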

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print(x)\\n")
if x == 1:
    print(x)

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon, and that we use hex escapes to make the
two trailing blanks apparent in the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "    print(x)\\n")
if x == 1 :\x20\x20
    print(x)

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print(x) # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print(x) # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print('x == 1')\\n")
if (x # The comments need to go in the right place
    == 1):
    print('x == 1')

"""

import os, glob, random, time, sys
from io import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print('Testing:', f)
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

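    # Compare just the (type, string) pairs: untokenize() is fed 2-tuples
    # here, so it regenerates whitespace itself and token positions are
    # expected to differ between the two passes.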
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
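        # Fixed-width columns (10 chars for the type, 13 for the token repr)
        # keep the doctest tables in the module docstring aligned.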
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    print(source, end="")

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

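    The extra spaces in the result come from untokenize()'s compatibility
    mode: decistmt() feeds it bare (type, string) 2-tuples, so rather than
    restoring the original column positions it inserts a space after each
    name token, as the output above shows.
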
    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

def test_main():
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    # Exclude test_pep263 which is encoded in KOI8-R
    testfiles = [t for t in testfiles if not t.endswith("pep263.py")]
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a 'still working' message since this test can be really slow.
        if verbose:
            print('    round trip: ', f, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print('  test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')

def test_rarrow():
    """
    This function exists solely to test the tokenization of the RARROW
    operator.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """

if __name__ == "__main__":
    test_main()