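"""Tests for urllib.robotparser (RobotFileParser).

A typical use of the documented RobotFileParser API looks like the sketch
below; the URL and user-agent string are illustrative only:

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')
    rp.read()
    rp.can_fetch('ExampleBot', 'http://www.example.com/some/path')

The test cases in this module exercise that API against inline robots.txt
payloads and, in NetworkTestCase, against www.pythontest.net.
"""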
import io
import os
import threading
import unittest
import urllib.request  # urlcleanup() is called in PasswordProtectedSiteTestCase
import urllib.robotparser
from test import support
from test.support import socket_helper
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


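# Each concrete test case below supplies an inline robots_txt payload plus
# lists of `good` (fetchable) and `bad` (blocked) URLs; BaseRobotTest.setUp()
# feeds the text to RobotFileParser.parse() and the shared tests check
# can_fetch() for every URL.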
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = ''
    good = ['/foo']


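# urllib.robotparser.RequestRate is a named tuple with `requests` and
# `seconds` fields, so RequestRate(9, 30) below mirrors the
# "Request-rate: 9/30" line (nine requests per thirty seconds) in the
# robots_txt payload.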
class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # The order of User-agent entries matters: the first matching entry is
    # used.  Note that this file is incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile".
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


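# str(parser) re-serializes the parsed rules: specific user-agent entries are
# emitted first and the default "*" entry last, which is the ordering
# expected_output below encodes.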
class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


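# RobotFileParser.read() treats a 401/403 response for robots.txt as
# "disallow everything", so can_fetch() must return False for any URL on a
# password-protected site (RobotHandler above always answers 403).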
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        # clear _opener global variable
        self.addCleanup(urllib.request.urlcleanup)

        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


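# These tests need network access to www.pythontest.net.  test_read_404
# relies on a missing robots.txt (404) making read() set allow_all and leave
# mtime() at 0.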
class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))

if __name__ == '__main__':
    unittest.main()