import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
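    """Shared scaffolding for the robots.txt parsing tests below.

    Subclasses define a ``robots_txt`` payload plus ``good``/``bad`` URL
    lists (entries may be ``(agent, url)`` tuples); setUp() parses the
    payload and the base tests check can_fetch() against both lists.
    """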
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
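    """Adds checks for the crawl_delay and request_rate a subclass expects."""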

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertIsInstance(
                        self.parser.request_rate(agent),
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # these values are not actually tested, but the robots.txt still needs
    # to be parsed in order to accommodate the input parameters
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent entries should be respected.  Note that
    # this file is incorrect because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
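    # str(parser) is expected to reproduce the parsed rules with specific
    # user agents listed before the default '*' entry and comments stripped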
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):
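    # answers every request with 403, simulating a robots.txt that is
    # inaccessible (e.g. behind authentication)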

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):
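    # tests against the live robots.txt at www.pythontest.net; requires
    # the 'network' test resource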

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()