import io
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class RobotTestCase(unittest.TestCase):
    def __init__(self, index=None, parser=None, url=None, good=None,
                 agent=None, request_rate=None, crawl_delay=None):
        # workaround to make unittest discovery work (see #17066)
        if not isinstance(index, int):
            return
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent
        self.request_rate = request_rate
        self.crawl_delay = crawl_delay

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
            self.assertEqual(self.parser.crawl_delay(agent), self.crawl_delay)
            # if we have actual values for request rate
            if self.request_rate and self.parser.request_rate(agent):
                self.assertEqual(
                    self.parser.request_rate(agent).requests,
                    self.request_rate.requests
                )
                self.assertEqual(
                    self.parser.request_rate(agent).seconds,
                    self.request_rate.seconds
                )
            self.assertEqual(self.parser.request_rate(agent), self.request_rate)
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              request_rate, crawl_delay, agent="test_robotparser"):

    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent,
                                    request_rate, crawl_delay))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent,
                                    request_rate, crawl_delay))
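
# Each scenario below follows the same pattern; roughly equivalent to this
# minimal sketch (a doctest-style comment only, not executed, mirroring how
# the RobotTest() helper above drives the parser):
#
#   >>> parser = urllib.robotparser.RobotFileParser()
#   >>> parser.parse(["User-agent: *", "Disallow: /tmp/"])
#   >>> parser.can_fetch("test_robotparser", "/index.html")
#   True
#   >>> parser.can_fetch("test_robotparser", "/tmp/xxx")
#   False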

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']
request_rate = None
crawl_delay = None

RobotTest(1, doc, good, bad, request_rate, crawl_delay)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']
request_rate = None # The parameters should be equal to None since they
crawl_delay = None # don't apply to the cybermapper user agent

RobotTest(2, doc, good, bad, request_rate, crawl_delay)
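
# A minimal sketch of the per-agent lookup exercised above (a doctest-style
# comment only, not executed; it restates what RobotTest(2, ...) asserts):
#
#   >>> parser = urllib.robotparser.RobotFileParser()
#   >>> parser.parse(doc.splitlines())
#   >>> parser.can_fetch("cybermapper", "/cyberworld/map/index.html")
#   True
#   >>> parser.can_fetch("test_robotparser", "/cyberworld/map/index.html")
#   False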

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']
request_rate = None
crawl_delay = None

RobotTest(3, doc, good, bad, request_rate, crawl_delay)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

request_rate = namedtuple('req_rate', 'requests seconds')
request_rate.requests = 9
request_rate.seconds = 30
crawl_delay = 3
request_rate_bad = None # not actually tested, but we still need to parse it
crawl_delay_bad = None # in order to accommodate the input parameters


RobotTest(4, doc, good, bad, request_rate, crawl_delay, 'figtree')
RobotTest(5, doc, good, bad, request_rate_bad, crawl_delay_bad,
          'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']
crawl_delay = 3
request_rate = None # since request rate has invalid syntax, return None

RobotTest(6, doc, good, bad, None, None)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
Crawl-delay: pears
"""

good = ['/foo.html']
bad = [] # bug report says "/" should be denied, but that is not in the RFC

crawl_delay = None # since crawl delay has invalid syntax, return None
request_rate = None

RobotTest(7, doc, good, bad, request_rate, crawl_delay)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
crawl_delay = None
request_rate = None # invalid syntax, return None

RobotTest(8, doc, good, bad, request_rate, crawl_delay, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, None, None, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, None, None, agent="Googlebot-Mobile")
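
# A minimal sketch of the substring problem described above (a doctest-style
# comment only, not executed; it restates what tests 9 and 10 assert): the
# "Googlebot" entry is matched first, so the mobile agent never reaches its
# own "Allow: /" entry.
#
#   >>> parser = urllib.robotparser.RobotFileParser()
#   >>> parser.parse(doc.splitlines())
#   >>> parser.can_fetch("Googlebot", "/something.jpg")
#   False
#   >>> parser.can_fetch("Googlebot-Mobile", "/something.jpg")
#   False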

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, None, None, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, None, None, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, None, None, agent="googlebot")
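
# A minimal sketch of the ordering rule described above (a doctest-style
# comment only, not executed; it restates what test 13 asserts): with the
# specific "Allow" line listed before the broader "Disallow", the first
# matching rule wins.
#
#   >>> parser = urllib.robotparser.RobotFileParser()
#   >>> parser.parse(doc.splitlines())
#   >>> parser.can_fetch("googlebot", "/folder1/myfile.html")
#   True
#   >>> parser.can_fetch("googlebot", "/folder1/anotherfile.html")
#   False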


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad, None, None)
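
# A minimal sketch of the query-string handling from issue #6325 (a
# doctest-style comment only, not executed; it restates what test 14 asserts):
#
#   >>> parser = urllib.robotparser.RobotFileParser()
#   >>> parser.parse(doc.splitlines())
#   >>> parser.can_fetch("test_robotparser", "/some/path")
#   True
#   >>> parser.can_fetch("test_robotparser", "/some/path?name=value")
#   False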

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad, None, None)

# 16. Empty query (issue #17403). Normalizing the url first.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad, None, None)
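
# A minimal sketch of the normalization from issue #17403 (a doctest-style
# comment only, not executed; it restates what test 16 asserts): a trailing
# "?" with an empty query is normalized away before matching.
#
#   >>> parser = urllib.robotparser.RobotFileParser()
#   >>> parser.parse(doc.splitlines())
#   >>> parser.can_fetch("test_robotparser", "/some/path?")
#   True
#   >>> parser.can_fetch("test_robotparser", "/another/path?")
#   False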


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    def runTest(self):
        self.testPasswordProtectedSite()

    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))

    def __str__(self):
        return '%s' % self.__class__.__name__

class NetworkTestCase(unittest.TestCase):

    @unittest.skip('does not handle the gzip encoding delivered by pydotorg')
    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

def load_tests(loader, suite, pattern):
    suite = unittest.makeSuite(NetworkTestCase)
    suite.addTest(tests)
    suite.addTest(PasswordProtectedSiteTestCase())
    return suite

if __name__ == '__main__':
    unittest.main()