"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    a    = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'a'           (3, 4) (3, 5)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

' # Emacs hint

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print(x)\\n")
if x == 1:
    print(x)

Some people use different formatting conventions, which makes
untokenize a little trickier. This test involves trailing whitespace
after the colon; we use hex escapes to make the two trailing blanks
apparent in the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "    print(x)\\n")
if x == 1 :\x20\x20
    print(x)

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print(x) # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print(x) # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print('x == 1')\\n")
if (x # The comments need to go in the right place
    == 1):
    print('x == 1')

"""

# ' Emacs hint

import os, glob, random, time, sys
import re
from io import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 5 * 60

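# A sketch (illustrative, not part of the original tests) of the NL/NEWLINE
# distinction described in the module docstring: NEWLINE closes a logical
# line of code, while NL closes a blank or comment-only line.
def _nl_vs_newline_sketch():
    lines = iter(["# just a comment\n", "x = 1\n"])
    names = [tok_name[tok[0]] for tok in generate_tokens(lines.__next__)]
    assert "NL" in names and "NEWLINE" in names
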
# Test roundtrip for `untokenize`. `f` is a file path. The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print('Testing:', f)
    # Get the encoding first.
    fobj = open(f, encoding="latin-1")
    first2lines = fobj.readline() + fobj.readline()
    fobj.close()
    m = re.search(r"coding:\s*(\S+)", first2lines)
    if m:
        encoding = m.group(1)
        print(" coding:", encoding)
    else:
        encoding = "utf-8"
    fobj = open(f, encoding=encoding)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    # Compare the (type, string) pairs from the first tokenization with
    # those from tokenizing the untokenized text.
    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(True)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

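# A minimal sketch (not part of the original test suite; the helper name is
# illustrative) of the same roundtrip contract on an in-memory string: with
# 2-tuples, untokenize() guarantees only that its output tokenizes back to
# the same (type, string) pairs, not that the original spelling is preserved.
def _roundtrip_string_sketch(source="3 + 4\n"):
    tokens = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    rebuilt = untokenize(tokens)
    readline = iter(rebuilt.splitlines(True)).__next__
    assert tokens == [tok[:2] for tok in generate_tokens(readline)]
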
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    print(source, end="")

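# Sketch of the readline protocol the helpers above rely on (illustrative,
# not part of the original tests): generate_tokens() accepts any
# zero-argument callable that returns successive source lines, so an
# iterator's __next__ works just as well as StringIO(...).readline.
def _tokens_from_lines_sketch(lines=["x = 1\n"]):
    return [tok[:2] for tok in generate_tokens(iter(lines).__next__)]
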
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = generate_tokens(StringIO(s).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

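# Usage sketch for decistmt() (illustrative, not part of the original
# tests): any float literal, i.e. a NUMBER token containing '.', gets
# wrapped, with the spacing shown in the doctest above.
def _decistmt_usage_sketch():
    assert "Decimal ('1.5')" in decistmt('x = 1.5\n')
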
def test_main():
    if verbose:
        print('starting...')

    next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        # Print a still-working message since this test can be really slow.
        if verbose:
            print(' round trip: ', f, file=sys.__stdout__)
        if next_time <= time.time():
            next_time = time.time() + _PRINT_WORKING_MSG_INTERVAL
            print(' test_main still working, be patient...', file=sys.__stdout__)
            sys.__stdout__.flush()

        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print('finished')

def test_rarrow():
    """
    This function exists solely to test the tokenization of the RARROW
    operator.

    >>> tokenize(iter(['->']).__next__) #doctest: +NORMALIZE_WHITESPACE
    1,0-1,2:\tOP\t'->'
    2,0-2,0:\tENDMARKER\t''
    """

if __name__ == "__main__":
    test_main()