Blame - Lib/test/test_tokenize.py - platform/external/python/cpython3

blob: e59d9c672cff69086b62c0edc9c4eb3cef48a24d [file] [log] [blame]

Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	1	"""Tests for the tokenize module.
				2
				3	The tests were originally written in the old Python style, where the
				4	test output was compared to a golden file. This docstring represents
				5	the first steps towards rewriting the entire test as a doctest.
				6
				7	The tests can be really simple. Given a small fragment of source
				8	code, print out a table with the tokens. The ENDMARKER is omitted for
				9	brevity.
				10
				11	>>> dump_tokens("1 + 1")
				12	NUMBER '1' (1, 0) (1, 1)
				13	OP '+' (1, 2) (1, 3)
				14	NUMBER '1' (1, 4) (1, 5)
				15
				16	A comment generates a token here, unlike in the parser module. The
				17	comment token is followed by an NL or a NEWLINE token, depending on
				18	whether the line contains the completion of a statement.
				19
				20	>>> dump_tokens("if False:\\n"
				21	... " # NL\\n"
Guido van Rossum	e7ba495	2007-06-06 23:52:48 +0000	[diff] [blame]	22	... " a = False # NEWLINE\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	23	NAME 'if' (1, 0) (1, 2)
				24	NAME 'False' (1, 3) (1, 8)
				25	OP ':' (1, 8) (1, 9)
				26	NEWLINE '\\n' (1, 9) (1, 10)
				27	COMMENT '# NL' (2, 4) (2, 8)
				28	NL '\\n' (2, 8) (2, 9)
				29	INDENT ' ' (3, 0) (3, 4)
Guido van Rossum	e7ba495	2007-06-06 23:52:48 +0000	[diff] [blame]	30	NAME 'a' (3, 4) (3, 5)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	31	OP '=' (3, 9) (3, 10)
				32	NAME 'False' (3, 11) (3, 16)
				33	COMMENT '# NEWLINE' (3, 17) (3, 26)
				34	NEWLINE '\\n' (3, 26) (3, 27)
				35	DEDENT '' (4, 0) (4, 0)
				36
Guido van Rossum	a6bcefc	2007-08-01 18:06:13 +0000	[diff] [blame]	37	' # Emacs hint
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	38
				39	There will be a bunch more tests of specific source patterns.
				40
				41	The tokenize module also defines an untokenize function that should
				42	regenerate the original program text from the tokens.
				43
				44	There are some standard formatting practices that are easy to get right.
				45
				46	>>> roundtrip("if x == 1:\\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	47	... " print(x)\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	48	if x == 1:
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	49	print(x)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	50
				51	Some people use different formatting conventions, which makes
				52	untokenize a little trickier. Note that this test involves trailing
				53	whitespace after the colon. Note that we use hex escapes to make the
				54	two trailing blanks apparent in the expected output.
				55
				56	>>> roundtrip("if x == 1 : \\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	57	... " print(x)\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	58	if x == 1 :\x20\x20
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	59	print(x)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	60
				61	Comments need to go in the right place.
				62
				63	>>> roundtrip("if x == 1:\\n"
				64	... " # A comment by itself.\\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	65	... " print(x) # Comment here, too.\\n"
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	66	... " # Another comment.\\n"
				67	... "after_if = True\\n")
				68	if x == 1:
				69	# A comment by itself.
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	70	print(x) # Comment here, too.
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	71	# Another comment.
				72	after_if = True
				73
				74	>>> roundtrip("if (x # The comments need to go in the right place\\n"
				75	... " == 1):\\n"
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	76	... " print('x == 1')\\n")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	77	if (x # The comments need to go in the right place
				78	== 1):
Georg Brandl	88fc664	2007-02-09 21:28:07 +0000	[diff] [blame]	79	print('x == 1')
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	80
				81	"""
				82
Guido van Rossum	cfbbf48	2007-08-04 17:43:15 +0000	[diff] [blame]	83	# ' Emacs hint
				84
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	85	import os, glob, random, time, sys
Guido van Rossum	cfbbf48	2007-08-04 17:43:15 +0000	[diff] [blame]	86	import re
Guido van Rossum	c43e79f	2007-06-18 18:26:36 +0000	[diff] [blame]	87	from io import StringIO
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	88	from test.test_support import (verbose, findfile, is_resource_enabled,
				89	TestFailed)
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	90	from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
				91	ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)
				92
				93	# How much time in seconds can pass before we print a 'Still working' message.
				94	_PRINT_WORKING_MSG_INTERVAL = 5 * 60
Guido van Rossum	0874f7f	1997-10-27 22:15:06 +0000	[diff] [blame]	95
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	96	# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
				97	# is tokenized, converted back to source code via tokenize.untokenize(),
				98	# and tokenized again from the latter. The test fails if the second
				99	# tokenization doesn't match the first.
def test_roundtrip(f):
    """Check that untokenize() round-trips the source file at path `f`.

    The file is tokenized, the (type, string) pairs of each token are turned
    back into source text with untokenize(), and that text is tokenized
    again.

    Raises TestFailed if the second token stream differs from the first.
    """
    # Sniff the coding declaration from the first two lines.  latin-1 maps
    # every byte to a character, so this preliminary read cannot fail to
    # decode regardless of the file's real encoding.
    with open(f, encoding="latin-1") as fobj:
        first2lines = fobj.readline() + fobj.readline()

    # PEP 263 permits both "coding:" and "coding=", and restricts the
    # encoding name to letters, digits, '-', '_' and '.', so trailing
    # decoration such as "-*-" is not captured.
    m = re.search(r"coding[:=]\s*([-\w.]+)", first2lines)
    encoding = m.group(1) if m else "utf-8"

    with open(f, encoding=encoding) as fobj:
        fulltok = list(generate_tokens(fobj.readline))

    # Only the token type and string are compared; positions are dropped.
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(True)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)
Raymond Hettinger	68c0453	2005-06-10 11:05:19 +0000	[diff] [blame]	124
def dump_tokens(s):
    """Print the tokens of the source string `s` as a table, one per line.

    Each row shows the token type name, the repr of the token text, and the
    start and end positions.  The type name is padded/truncated to 10
    characters and the repr to 13.

    The ENDMARKER is omitted.
    """
    source = StringIO(s)
    for tok_type, token, start, end, _line in generate_tokens(source.readline):
        if tok_type == ENDMARKER:
            break
        # Explicit arguments instead of "% locals()": no reliance on local
        # variable names, and the builtin `type` is no longer shadowed.
        print("%-10.10s %-13.13r %s %s"
              % (tok_name[tok_type], token, start, end))
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	136
				137	def roundtrip(s):
				138	f = StringIO(s)
				139	source = untokenize(generate_tokens(f.readline))
Guido van Rossum	0bcbb0d	2007-02-09 22:43:10 +0000	[diff] [blame]	140	print(source, end="")
Thomas Wouters	89f507f	2006-12-13 04:49:30 +0000	[diff] [blame]	141
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	142	# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Rewrite the statements in `s` so float literals become Decimals.

    Every NUMBER token whose text contains a '.' is replaced by a call
    Decimal('<original text>'); all other tokens pass through unchanged.
    The rewritten source is returned as a string.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    rewritten = []
    for tok in generate_tokens(StringIO(s).readline):
        kind, text = tok[0], tok[1]
        if kind != NUMBER or '.' not in text:
            # Not a float literal: keep just the (type, string) pair.
            rewritten.append((kind, text))
            continue
        # Float literal: emit the four tokens of Decimal('<text>').
        rewritten += [
            (NAME, 'Decimal'),
            (OP, '('),
            (STRING, repr(text)),
            (OP, ')'),
        ]
    return untokenize(rewritten)
				179
def test_main():
    """Run the tokenize regression checks.

    In order: compile tokenize_tests.txt to validate it, tokenize that file
    via tokenize(), round-trip test_tokenize.py and a set of test*.py files
    through untokenize(), check that inconsistent dedentation raises
    IndentationError, and finally run this module's doctests.

    Raises TestFailed if a round trip fails or the IndentationError is not
    detected.
    """
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # Validate the tokenize_tests.txt file.
    # This makes sure it compiles, and displays any errors in it.
    tests_path = findfile('tokenize_tests.txt')
    with open(tests_path) as f:
        compile(f.read(), 'tokenize_tests.txt', 'exec')

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    with open(tests_path) as f:
        tokenize(f.readline)

    # Now run test_roundtrip() over test_tokenize.py too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    this_file = findfile('test_tokenize.py')
    if verbose:
        print(' round trip: ', this_file, file=sys.__stdout__)
    test_roundtrip(this_file)

    testdir = os.path.dirname(this_file) or os.curdir
    testfiles = glob.glob(os.path.join(testdir, 'test*.py'))
    if not is_resource_enabled('compiler'):
        # random.sample() raises ValueError when asked for more items than
        # the population holds, so cap the sample size.
        testfiles = random.sample(testfiles, min(10, len(testfiles)))

    for path in testfiles:
        # Print still working message since this test can be really slow
        if verbose:
            print(' round trip: ', path, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print(' test_main still working, be patient...',
                  file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(path)

    # Test detection of IndentationError.
    # NOTE(review): the indentation inside this literal is reconstructed
    # (dedent to a column that matches no enclosing block) -- confirm
    # against the upstream file.
    bad_source = (
        "def foo():\n"
        "    bar\n"
        "  baz\n"
    )

    try:
        for tok in generate_tokens(StringIO(bad_source).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError:")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')
Thomas Wouters	49fd7fa	2006-04-21 10:40:58 +0000	[diff] [blame]	246
def test_rarrow():
    """
    Doctest-only function: it has no body and exists just so the example
    below checks how the RARROW ('->') operator is tokenized.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """
				256
# Run the whole suite when this module is executed directly as a script.
if __name__ == '__main__':
    test_main()