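"""Tests for urllib.robotparser (RobotFileParser)."""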
import io
import os
import threading
import unittest
import urllib.request
import urllib.robotparser
from test import support
from test.support import socket_helper
from test.support import threading_helper
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
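    """Shared test scaffolding.

    Subclasses define a robots_txt body plus 'good' and 'bad' URL lists
    (plain paths, or (agent, url) tuples); the inherited tests check that
    can_fetch() allows every good URL and rejects every bad one.
    """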
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
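    """Adds checks that crawl_delay() and request_rate() report the values
    given by the class attributes for every URL in 'good' and 'bad'."""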
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
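    """An empty robots.txt file places no restrictions on any agent."""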
    robots_txt = ''
    good = ['/foo']


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # Also test that Allow and Disallow work well with each other.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # The order of the User-agent entries matters.  Note that this file
    # is problematic because "Googlebot" is a substring of
    # "Googlebot-Mobile".
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong: the URLs need to be specified
    # from more specific to more general.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
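    """str() of the parser should reproduce the parsed rule set, with the
    default (*) entry emitted last."""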
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):
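    """Respond to every request with 403 Forbidden, so robots.txt can
    never be fetched."""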

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):
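    """If robots.txt cannot be fetched because access is forbidden (403),
    the parser must treat every URL as disallowed."""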

    def setUp(self):
        # clear _opener global variable
        self.addCleanup(urllib.request.urlcleanup)

        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @threading_helper.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):
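    """Tests that fetch a real robots.txt from www.pythontest.net; they
    only run when the 'network' test resource is enabled."""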

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with socket_helper.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()