Blame - Lib/test/test_robotparser.py - platform/external/python/cpython2

blob: b3d4a46056bb0d03d1d218c43b9185ee7daaea8b [file] [log] [blame]

Barry Warsaw	04f357c	2002-07-23 19:04:11 +0000	[diff] [blame]	1	import unittest, StringIO, robotparser
				2	from test import test_support
Antoine Pitrou	2960308	2011-07-08 19:40:15 +0200	[diff] [blame]	3	from urllib2 import urlopen, HTTPError
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	4
				5	class RobotTestCase(unittest.TestCase):
				6	def __init__(self, index, parser, url, good, agent):
				7	unittest.TestCase.__init__(self)
				8	if good:
				9	self.str = "RobotTest(%d, good, %s)" % (index, url)
				10	else:
				11	self.str = "RobotTest(%d, bad, %s)" % (index, url)
				12	self.parser = parser
				13	self.url = url
				14	self.good = good
				15	self.agent = agent
				16
				17	def runTest(self):
				18	if isinstance(self.url, tuple):
				19	agent, url = self.url
				20	else:
				21	url = self.url
				22	agent = self.agent
				23	if self.good:
Benjamin Peterson	5c8da86	2009-06-30 22:57:08 +0000	[diff] [blame]	24	self.assertTrue(self.parser.can_fetch(agent, url))
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	25	else:
Benjamin Peterson	5c8da86	2009-06-30 22:57:08 +0000	[diff] [blame]	26	self.assertFalse(self.parser.can_fetch(agent, url))
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	27
				28	def __str__(self):
				29	return self.str
				30
# Module-level suite: RobotTest() appends its generated cases to it and
# test_main() passes it to test_support.run_unittest().
tests = unittest.TestSuite()
				32
				33	def RobotTest(index, robots_txt, good_urls, bad_urls,
				34	agent="test_robotparser"):
Tim Peters	863ac44	2002-04-16 01:38:40 +0000	[diff] [blame]	35
				36	lines = StringIO.StringIO(robots_txt).readlines()
				37	parser = robotparser.RobotFileParser()
				38	parser.parse(lines)
				39	for url in good_urls:
				40	tests.addTest(RobotTestCase(index, parser, url, 1, agent))
				41	for url in bad_urls:
				42	tests.addTest(RobotTestCase(index, parser, url, 0, agent))
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	43
# Each numbered section below rebinds the module-level names ``doc`` (the
# robots.txt text), ``good`` (URLs expected to be fetchable) and ``bad``
# (URLs expected to be refused), then hands them to RobotTest().
# An entry may be an (agent, url) tuple instead of a plain URL string, in
# which case that agent is used for the entry instead of the default one.

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

# The tuple entry is checked with the 'cybermapper' agent.
good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

# Tests 4 and 5 reuse the same doc/good/bad; only the agent string differs.
RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

# 10. Same doc as test 9, queried as "Googlebot-Mobile".
good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

# 12. Same doc as test 11, queried as "Googlebot-Mobile".
good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
# NOTE(review): the rules here are the same text as in test 8; the visible
# difference is the lower-case agent name "googlebot" -- confirm the intent.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)
				230
Skip Montanaro	1ef19f0	2008-07-27 00:49:02 +0000	[diff] [blame]	231
class NetworkTestCase(unittest.TestCase):
    """robotparser checks against live web sites.

    Each test first calls test_support.requires('network') and then runs
    inside a test_support.transient_internet() context for its host.
    """

    def testPasswordProtectedSite(self):
        # Expectation: after reading a robots.txt that answers 401/403,
        # can_fetch("*", ...) reports False for that URL.
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # The test site is a bit flaky, so probe robots.txt directly
            # first and skip unless it answers with a 401 or 403.
            try:
                urlopen(robots_url)
            except HTTPError as err:
                if err.code not in (401, 403):
                    reason = (
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, err.code))
                    self.skipTest(reason)
            else:
                reason = (
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % robots_url)
                self.skipTest(reason)
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    def testPythonOrg(self):
        # Expectation: python.org's robots.txt lets "*" fetch robots.txt.
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            robots_url = "http://www.python.org/robots.txt"
            parser = robotparser.RobotFileParser(robots_url)
            parser.read()
            self.assertTrue(parser.can_fetch("*", robots_url))
Florent Xicluna	f37592f	2010-04-02 17:26:42 +0000	[diff] [blame]	268
				269
def test_main():
    """Run the generated robots.txt suite, then the network tests.

    The two are passed to test_support.run_unittest() in separate calls,
    in that order.
    """
    for suite in (tests, NetworkTestCase):
        test_support.run_unittest(suite)


if __name__ == '__main__':
    # Running the file directly turns on verbose test output.
    test_support.verbose = 1
    test_main()