import unittest, StringIO, robotparser
from test import test_support
from urllib2 import urlopen, HTTPError

HAVE_HTTPS = True
try:
    from urllib2 import HTTPSHandler
except ImportError:
    HAVE_HTTPS = False

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    # Parse the robots.txt body and register one test case per URL:
    # good_urls must be fetchable for the agent, bad_urls must not be.
    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, True, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, False, agent))
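
# A minimal sketch (not part of the suite) of the robotparser API that
# RobotTest drives: feed robots.txt lines to a parser, then ask can_fetch()
# whether a given agent may retrieve a given URL. The rules here are
# illustrative only.
def _example_can_fetch():
    parser = robotparser.RobotFileParser()
    parser.parse(["User-agent: *", "Disallow: /private/"])
    # The catch-all entry applies, and the path matches the Disallow rule.
    return parser.can_fetch("test_robotparser", "/private/page.html")  # False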

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html', '/', '/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = []  # XFAIL '/a/b.html'
bad = ['/tmp', '/tmp.html', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', '/a%2fb.html',
       '/~joe/index.html']

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp']  # XFAIL: '/a%2fb.html'
bad = ['/tmp/', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = []  # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so the Googlebot entry matches both agents and
# test 10 behaves just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
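
# A minimal sketch (not run by the suite) of the matching rule behind
# tests 9 and 10: robotparser applies the first entry whose User-agent
# token is a case-insensitive substring of the client's agent name, so
# the "Googlebot" entry above captures "Googlebot-Mobile" as well.
def _example_agent_matching():
    parser = robotparser.RobotFileParser()
    parser.parse(["User-agent: Googlebot",
                  "Disallow: /",
                  "",
                  "User-agent: Googlebot-Mobile",
                  "Allow: /"])
    # "googlebot" is found inside "googlebot-mobile", so the Disallow
    # entry wins for both agent names.
    return parser.can_fetch("Googlebot-Mobile", "/something.jpg")  # False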

# 11. Get the order correct: the more specific agent entry must come first.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8: you need to specify the
# rules from more specific to more general. The same robots.txt is
# queried here with a lowercased agent name.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query string (issue #17403). The URL is normalized first.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)
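
# A minimal sketch (not run by the suite) of the normalization test 16
# relies on: round-tripping a URL through urlparse drops a bare trailing
# "?", so "/some/path?" and "/some/path" compare equal once normalized.
def _example_normalize(url):
    import urlparse
    return urlparse.urlunparse(urlparse.urlparse(url))  # '/some/path?' -> '/some/path'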


class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url,))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertFalse(parser.can_fetch("*", robots_url))

    @unittest.skipUnless(HAVE_HTTPS, 'need SSL support to download license')
    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__ == '__main__':
    test_support.verbose = 1
    test_main()