Blame - Lib/test/test_robotparser.py - platform/external/python/cpython3

blob: 68a5e9ce2fd0ddad4e620a3cc85ecfe584b71991 [file] [log] [blame]

Guido van Rossum	34d1928	2007-08-09 01:03:29 +0000	[diff] [blame]	1	import io
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	2	import unittest
				3	import urllib.robotparser
Antoine Pitrou	95531ea	2011-07-08 19:43:51 +0200	[diff] [blame]	4	from urllib.error import URLError, HTTPError
				5	from urllib.request import urlopen
Benjamin Peterson	ee8712c	2008-05-20 21:35:26 +0000	[diff] [blame]	6	from test import support
Senthil Kumaran	601d6ec	2014-06-25 02:58:15 -0700	[diff] [blame]	7	import threading
				8	from http.server import BaseHTTPRequestHandler, HTTPServer
				9
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	10
				11	class RobotTestCase(unittest.TestCase):
Ezio Melotti	0fb37ea	2013-03-12 07:49:12 +0200	[diff] [blame]	12	def __init__(self, index=None, parser=None, url=None, good=None, agent=None):
				13	# workaround to make unittest discovery work (see #17066)
				14	if not isinstance(index, int):
				15	return
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	16	unittest.TestCase.__init__(self)
				17	if good:
				18	self.str = "RobotTest(%d, good, %s)" % (index, url)
				19	else:
				20	self.str = "RobotTest(%d, bad, %s)" % (index, url)
				21	self.parser = parser
				22	self.url = url
				23	self.good = good
				24	self.agent = agent
				25
				26	def runTest(self):
				27	if isinstance(self.url, tuple):
				28	agent, url = self.url
				29	else:
				30	url = self.url
				31	agent = self.agent
				32	if self.good:
Benjamin Peterson	c9c0f20	2009-06-30 23:06:06 +0000	[diff] [blame]	33	self.assertTrue(self.parser.can_fetch(agent, url))
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	34	else:
Benjamin Peterson	c9c0f20	2009-06-30 23:06:06 +0000	[diff] [blame]	35	self.assertFalse(self.parser.can_fetch(agent, url))
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	36
				37	def __str__(self):
				38	return self.str
				39
				40	tests = unittest.TestSuite()
				41
				42	def RobotTest(index, robots_txt, good_urls, bad_urls,
				43	agent="test_robotparser"):
Tim Peters	863ac44	2002-04-16 01:38:40 +0000	[diff] [blame]	44
Guido van Rossum	34d1928	2007-08-09 01:03:29 +0000	[diff] [blame]	45	lines = io.StringIO(robots_txt).readlines()
Jeremy Hylton	1afc169	2008-06-18 20:49:58 +0000	[diff] [blame]	46	parser = urllib.robotparser.RobotFileParser()
Tim Peters	863ac44	2002-04-16 01:38:40 +0000	[diff] [blame]	47	parser.parse(lines)
				48	for url in good_urls:
				49	tests.addTest(RobotTestCase(index, parser, url, 1, agent))
				50	for url in bad_urls:
				51	tests.addTest(RobotTestCase(index, parser, url, 0, agent))
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	52
				53	# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)
				54
				55	# 1.
				56	doc = """
				57	User-agent: *
				58	Disallow: /cyberworld/map/ # This is an infinite virtual URL space
				59	Disallow: /tmp/ # these will soon disappear
				60	Disallow: /foo.html
				61	"""
				62
				63	good = ['/','/test.html']
				64	bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']
				65
				66	RobotTest(1, doc, good, bad)
				67
				68	# 2.
				69	doc = """
				70	# robots.txt for http://www.example.com/
				71
				72	User-agent: *
				73	Disallow: /cyberworld/map/ # This is an infinite virtual URL space
				74
				75	# Cybermapper knows where to go.
				76	User-agent: cybermapper
				77	Disallow:
				78
				79	"""
				80
				81	good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
				82	bad = ['/cyberworld/map/index.html']
				83
				84	RobotTest(2, doc, good, bad)
				85
				86	# 3.
				87	doc = """
				88	# go away
				89	User-agent: *
				90	Disallow: /
				91	"""
				92
				93	good = []
				94	bad = ['/cyberworld/map/index.html','/','/tmp/']
				95
				96	RobotTest(3, doc, good, bad)
				97
				98	# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)
				99
				100	# 4.
				101	doc = """
				102	User-agent: figtree
				103	Disallow: /tmp
				104	Disallow: /a%3cd.html
				105	Disallow: /a%2fb.html
				106	Disallow: /%7ejoe/index.html
				107	"""
				108
				109	good = [] # XFAIL '/a/b.html'
				110	bad = ['/tmp','/tmp.html','/tmp/a.html',
				111	'/a%3cd.html','/a%3Cd.html','/a%2fb.html',
				112	'/~joe/index.html'
				113	]
				114
				115	RobotTest(4, doc, good, bad, 'figtree')
				116	RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')
				117
				118	# 6.
				119	doc = """
				120	User-agent: *
				121	Disallow: /tmp/
				122	Disallow: /a%3Cd.html
				123	Disallow: /a/b.html
				124	Disallow: /%7ejoe/index.html
				125	"""
				126
				127	good = ['/tmp',] # XFAIL: '/a%2fb.html'
				128	bad = ['/tmp/','/tmp/a.html',
				129	'/a%3cd.html','/a%3Cd.html',"/a/b.html",
Tim Peters	863ac44	2002-04-16 01:38:40 +0000	[diff] [blame]	130	'/%7Ejoe/index.html']
Martin v. Löwis	1c63f6e	2002-02-28 15:24:47 +0000	[diff] [blame]	131
				132	RobotTest(6, doc, good, bad)
				133
				134	# From bug report #523041
				135
				136	# 7.
				137	doc = """
				138	User-Agent: *
				139	Disallow: /.
				140	"""
				141
				142	good = ['/foo.html']
				143	bad = [] # Bug report says "/" should be denied, but that is not in the RFC
				144
				145	RobotTest(7, doc, good, bad)
				146
# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8. An Allow rule listed ahead of a broader Disallow rule.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ["/folder1/myfile.html"]
bad = ["/folder1/anotherfile.html"]

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
#    "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ["/something.jpg"]

RobotTest(9, doc, good, bad, agent="Googlebot")

# 10. Same file as test 9, queried as "Googlebot-Mobile".
good = []
bad = ["/something.jpg"]

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ["/something.jpg"]

RobotTest(11, doc, good, bad, agent="Googlebot")

# 12. Same file as test 11, queried as "Googlebot-Mobile".
good = ["/something.jpg"]
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8.  You need to specify the
#     URLs from more specific to more general.  The agent is passed in
#     lower case here.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ["/folder1/myfile.html"]
bad = ["/folder1/anotherfile.html"]

RobotTest(13, doc, good, bad, agent="googlebot")
				213
				214
# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ["/some/path"]
bad = ["/some/path?name=value"]

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ["/another/path"]
bad = ["/some/path"]

RobotTest(15, doc, good, bad)

# 16. Empty query (issue #17403). Normalizing the url first.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ["/some/path?"]
bad = ["/another/path?"]

RobotTest(16, doc, good, bad)
				251
Benjamin Peterson	d631371	2008-07-31 16:23:04 +0000	[diff] [blame]	252
class RobotHandler(BaseHTTPRequestHandler):
    """HTTP request handler that refuses every GET request with a 403."""

    def do_GET(self):
        """Answer any GET request with a 403 "Forbidden access" error."""
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        """Do nothing, so that requests handled here produce no log output."""
				260
				261
				262	class PasswordProtectedSiteTestCase(unittest.TestCase):
				263
				264	def setUp(self):
				265	self.server = HTTPServer((support.HOST, 0), RobotHandler)
				266
				267	self.t = threading.Thread(
				268	name='HTTPServer serving',
				269	target=self.server.serve_forever,
				270	# Short poll interval to make the test finish quickly.
				271	# Time between requests is short enough that we won't wake
				272	# up spuriously too many times.
				273	kwargs={'poll_interval':0.01})
				274	self.t.daemon = True # In case this function raises.
				275	self.t.start()
				276
				277	def tearDown(self):
				278	self.server.shutdown()
				279	self.t.join()
				280	self.server.server_close()
				281
				282	def runTest(self):
				283	self.testPasswordProtectedSite()
Jeremy Hylton	73fd46d	2008-07-18 20:59:44 +0000	[diff] [blame]	284
				285	def testPasswordProtectedSite(self):
Senthil Kumaran	601d6ec	2014-06-25 02:58:15 -0700	[diff] [blame]	286	addr = self.server.server_address
				287	url = 'http://' + support.HOST + ':' + str(addr[1])
				288	robots_url = url + "/robots.txt"
				289	parser = urllib.robotparser.RobotFileParser()
				290	parser.set_url(url)
				291	parser.read()
				292	self.assertFalse(parser.can_fetch("*", robots_url))
				293
				294	def __str__(self):
				295	return '%s' % self.__class__.__name__
				296
				297	class NetworkTestCase(unittest.TestCase):
Thomas Wouters	47b49bf	2007-08-30 22:15:33 +0000	[diff] [blame]	298
Georg Brandl	89e5671	2014-02-23 08:45:15 +0100	[diff] [blame]	299	@unittest.skip('does not handle the gzip encoding delivered by pydotorg')
Jeremy Hylton	73fd46d	2008-07-18 20:59:44 +0000	[diff] [blame]	300	def testPythonOrg(self):
Florent Xicluna	41fe615	2010-04-02 18:52:12 +0000	[diff] [blame]	301	support.requires('network')
Antoine Pitrou	8bc0903	2010-09-07 21:09:09 +0000	[diff] [blame]	302	with support.transient_internet('www.python.org'):
				303	parser = urllib.robotparser.RobotFileParser(
				304	"http://www.python.org/robots.txt")
				305	parser.read()
				306	self.assertTrue(
				307	parser.can_fetch("*", "http://www.python.org/robots.txt"))
Jeremy Hylton	73fd46d	2008-07-18 20:59:44 +0000	[diff] [blame]	308
def load_tests(loader, suite, pattern):
    """Build the suite for this module (unittest ``load_tests`` protocol).

    The incoming *suite* and *pattern* are ignored.  The returned suite
    holds the NetworkTestCase tests, the generated robots.txt cases
    collected in ``tests``, and one PasswordProtectedSiteTestCase.
    """
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in
    # 3.13, so the test case is loaded through a TestLoader instead.
    if loader is None:
        loader = unittest.TestLoader()
    suite = loader.loadTestsFromTestCase(NetworkTestCase)
    suite.addTest(tests)
    suite.addTest(PasswordProtectedSiteTestCase())
    return suite


if __name__ == '__main__':
    unittest.main()