"""Tests for the tokenize module.

The tests were originally written in the old Python style, where the
test output was compared to a golden file. This docstring represents
the first steps towards rewriting the entire test as a doctest.

The tests can be really simple. Given a small fragment of source
code, print out a table with the tokens. The ENDMARKER is omitted for
brevity.

>>> dump_tokens("1 + 1")
NUMBER     '1'           (1, 0) (1, 1)
OP         '+'           (1, 2) (1, 3)
NUMBER     '1'           (1, 4) (1, 5)

A comment generates a token here, unlike in the parser module. The
comment token is followed by an NL or a NEWLINE token, depending on
whether the line contains the completion of a statement.

>>> dump_tokens("if False:\\n"
...             "    # NL\\n"
...             "    True = False # NEWLINE\\n")
NAME       'if'          (1, 0) (1, 2)
NAME       'False'       (1, 3) (1, 8)
OP         ':'           (1, 8) (1, 9)
NEWLINE    '\\n'          (1, 9) (1, 10)
COMMENT    '# NL'        (2, 4) (2, 8)
NL         '\\n'          (2, 8) (2, 9)
INDENT     '    '        (3, 0) (3, 4)
NAME       'True'        (3, 4) (3, 8)
OP         '='           (3, 9) (3, 10)
NAME       'False'       (3, 11) (3, 16)
COMMENT    '# NEWLINE'   (3, 17) (3, 26)
NEWLINE    '\\n'          (3, 26) (3, 27)
DEDENT     ''            (4, 0) (4, 0)

There will be a bunch more tests of specific source patterns.

The tokenize module also defines an untokenize function that should
regenerate the original program text from the tokens.

There are some standard formatting practices that are easy to get right.

>>> roundtrip("if x == 1:\\n"
...           "    print x\\n")
if x == 1:
    print x

Some people use different formatting conventions, which makes
untokenize a little trickier. This test involves trailing whitespace
after the colon; note that we use hex escapes to make the two
trailing blanks apparent in the expected output.

>>> roundtrip("if x == 1 :  \\n"
...           "    print x\\n")
if x == 1 :\x20\x20
    print x

Comments need to go in the right place.

>>> roundtrip("if x == 1:\\n"
...           "    # A comment by itself.\\n"
...           "    print x # Comment here, too.\\n"
...           "    # Another comment.\\n"
...           "after_if = True\\n")
if x == 1:
    # A comment by itself.
    print x # Comment here, too.
    # Another comment.
after_if = True

>>> roundtrip("if (x # The comments need to go in the right place\\n"
...           "    == 1):\\n"
...           "    print 'x == 1'\\n")
if (x # The comments need to go in the right place
    == 1):
    print 'x == 1'

"""

import os, glob, random
from cStringIO import StringIO
from test.test_support import (verbose, findfile, is_resource_enabled,
                               TestFailed)
from tokenize import (tokenize, generate_tokens, untokenize, tok_name,
                      ENDMARKER, NUMBER, NAME, OP, STRING, COMMENT)

# Test roundtrip for `untokenize`. `f` is a file path. The source code in
# `f` is tokenized, converted back to source code via tokenize.untokenize(),
# and tokenized again from the latter. The test fails if the second
# tokenization doesn't match the first.
def test_roundtrip(f):
    ## print 'Testing:', f
    fobj = open(f)
    try:
        fulltok = list(generate_tokens(fobj.readline))
    finally:
        fobj.close()

    t1 = [tok[:2] for tok in fulltok]
    newtext = untokenize(t1)
    # splitlines(1) keeps the line endings, which the readline protocol
    # expected by generate_tokens() requires.
    readline = iter(newtext.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    if t1 != t2:
        raise TestFailed("untokenize() roundtrip failed for %r" % f)

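# A minimal sketch (illustrative only, never called by the test) of the two
# untokenize() modes exercised in this file, assuming the behavior that
# tokenize documents: full 5-tuples let untokenize() honor the recorded
# positions, so the source should round-trip exactly, while (type, string)
# 2-tuples drop the positions and untokenize() can only guess the spacing.
# The name `untokenize_demo` is made up for this example.
def untokenize_demo(source):
    tokens = list(generate_tokens(StringIO(source).readline))
    exact = untokenize(tokens)                   # full mode: honors positions
    rough = untokenize([t[:2] for t in tokens])  # compatibility mode
    return exact, rough
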
def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = StringIO(s)
    for type, token, start, end, line in generate_tokens(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
        # The fixed field widths produce the aligned table expected by the
        # doctests in the module docstring.
        print "%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals()

def roundtrip(s):
    f = StringIO(s)
    source = untokenize(generate_tokens(f.readline))
    # The regenerated source already ends with a newline, so the trailing
    # comma keeps print from adding another.
    print source,

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print +21.3e-5*-.1234/81.7'
    >>> decistmt(s)
    "print +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
    we're only showing 12 digits, and the 13th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.21716034272e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """

    result = []
    g = generate_tokens(StringIO(s).readline)   # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result)

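# decistmt() hands untokenize() plain (type, string) 2-tuples, so the result
# is rendered in untokenize()'s compatibility mode, which appends a space
# after name and number tokens; that is why the doctest above shows
# "Decimal ('21.3e-5')" rather than "Decimal('21.3e-5')".
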
def test_main():
    if verbose:
        print 'starting...'

    # This displays the tokenization of tokenize_tests.txt to stdout, and
    # regrtest.py checks that this equals the expected output (in the
    # test/output/ directory).
    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
    tokenize(f.readline)
    f.close()

    # Now run test_roundtrip() over tokenize_tests.txt too, and over all
    # (if the "compiler" resource is enabled) or a small random sample (if
    # "compiler" is not enabled) of the test*.py files.
    f = findfile('tokenize_tests' + os.extsep + 'txt')
    test_roundtrip(f)

    testdir = os.path.dirname(f) or os.curdir
    testfiles = glob.glob(testdir + os.sep + 'test*.py')
    if not is_resource_enabled('compiler'):
        testfiles = random.sample(testfiles, 10)

    for f in testfiles:
        test_roundtrip(f)

    # Test detection of IndentationError.
    sampleBadText = """\
def foo():
    bar
  baz
"""

    try:
        for tok in generate_tokens(StringIO(sampleBadText).readline):
            pass
    except IndentationError:
        pass
    else:
        raise TestFailed("Did not detect IndentationError")

    # Run the doctests in this module.
    from test import test_tokenize  # i.e., this module
    from test.test_support import run_doctest
    run_doctest(test_tokenize, verbose)

    if verbose:
        print 'finished'

if __name__ == "__main__":
    test_main()