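"""Tests for urllib.robotparser (RobotFileParser)."""
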
import io
import unittest
import urllib.robotparser
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from test import support

class RobotTestCase(unittest.TestCase):
    def __init__(self, index=None, parser=None, url=None, good=None, agent=None):
        # workaround to make unittest discovery work (see #17066)
        if not isinstance(index, int):
            return
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

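# Helper: register one RobotTestCase per URL for the given robots.txt text.
# URLs in good_urls must be allowed by the parser and URLs in bad_urls must
# be disallowed; the resulting cases are collected into the suite above.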
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11.  Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13.  Google also got the order wrong in #8.  You need to specify the
#      URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query (issue #17403): the URL is normalized before matching.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)


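# The following tests talk to live servers and are therefore guarded by the
# 'network' test resource.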
class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        support.requires('network')
        with support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = urllib.robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except URLError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

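# load_tests() merges the network test cases with the RobotTest suite built
# above so that unittest discovery runs both.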
def load_tests(loader, suite, pattern):
    suite = unittest.makeSuite(NetworkTestCase)
    suite.addTest(tests)
    return suite

if __name__ == '__main__':
    support.use_resources = ['network']
    unittest.main()