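# A brief module docstring added for orientation; it only restates what the
# code below already does.
"""Tests for urllib.robotparser.

Most test classes follow one pattern: define an inline ``robots_txt``
payload plus ``good`` and ``bad`` URL lists, and let the shared base
class assert ``can_fetch()`` for each entry.  The network-backed tests
at the bottom exercise a live robots.txt and an HTTP 403 response.
"""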
import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
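    """Shared harness.

    Subclasses provide a ``robots_txt`` payload, an ``agent`` name, and
    ``good``/``bad`` URL lists (entries may be plain paths or
    ``(agent, url)`` tuples); ``test_good_urls`` and ``test_bad_urls``
    assert ``can_fetch()`` accordingly.
    """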
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
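    """Extends the harness to also check ``crawl_delay()`` and
    ``request_rate()`` against the expected class-level values."""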
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
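    """An empty robots.txt allows every URL and reports neither a crawl
    delay nor a request rate."""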
    robots_txt = ''
    good = ['/foo']


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
184 robots_txt = """\
Benjamin Petersond6313712008-07-31 16:23:04 +0000185User-agent: Googlebot
186Allow: /folder1/myfile.html
187Disallow: /folder1/
Berker Peksag960e8482015-10-08 12:27:06 +0300188Request-rate: whale/banana
Berker Peksag4da0fd02016-09-11 14:53:16 +0300189 """
190 agent = 'Googlebot'
191 good = ['/folder1/myfile.html']
192 bad = ['/folder1/anotherfile.html']
Benjamin Petersond6313712008-07-31 16:23:04 +0000193
Benjamin Petersond6313712008-07-31 16:23:04 +0000194
Berker Peksag4da0fd02016-09-11 14:53:16 +0300195class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
196 # the order of User-agent should be correct. note
197 # that this file is incorrect because "Googlebot" is a
198 # substring of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
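    """``str(parser)`` should reproduce the parsed rules, with the
    default (``*``) entry rendered last and inline comments dropped."""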
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):
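    """Answer every GET with 403 Forbidden, simulating a site whose
    robots.txt is behind authentication."""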

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):
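    """A 401/403 answer for robots.txt makes ``read()`` set
    ``disallow_all``, so ``can_fetch()`` must deny everything."""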

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):
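    """Live tests against http://www.pythontest.net/elsewhere/robots.txt
    (skipped unless the 'network' test resource is enabled)."""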

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()