"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file.  This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple.  Given a small fragment of source
code, print out a table with the tokens.  The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module.  The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'True'        (3, 4) (3, 8)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)


There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier.  Note that this test involves trailing
whitespace after the colon.  You can't see it, but it's there!

>>> roundtrip("if x == 1 : \\n"
...           "    print x\\n")
if x == 1 : 
    print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# Test roundtrip for `untokenize`.  `f` is a file path.  The source code in f
# is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter.  The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

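# A minimal illustrative sketch (not wired into test_main) of why the
# roundtrip check above compares token sequences rather than raw text:
# given bare (type, string) pairs, untokenize() lays out whitespace its
# own way, so the regenerated source can differ textually from the
# original while still tokenizing to the same sequence.  The helper name
# roundtrip_demo and its sample input are illustrative only.
def roundtrip_demo(source="x=1+2\n"):
    tokens = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    rebuilt = untokenize(tokens)            # e.g. "x =1 +2 \n"
    readline = iter(rebuilt.splitlines(1)).next
    assert tokens == [tok[:2] for tok in generate_tokens(readline)]
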
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    print source,

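# Note: dump_tokens() and roundtrip() are the helpers exercised by the
# doctests in the module docstring; run_doctest() in test_main() runs
# those doctests.
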
# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

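# Note: decistmt() hands untokenize() bare (type, string) pairs with no
# position information, so untokenize() inserts spacing of its own --
# which is why the doctest above shows "Decimal ('21.3e-5')" rather than
# "Decimal('21.3e-5')".
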
def test_main():
    if verbose:
        print 'starting...'

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()