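"""Tests for urllib.robotparser (RobotFileParser)."""
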
import io
import os
import threading
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
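
# The tests below drive the RobotFileParser API roughly like this
# (a minimal usage sketch; the agent name and URL are illustrative only):
#
#     rp = urllib.robotparser.RobotFileParser()
#     rp.set_url('http://www.example.com/robots.txt')
#     rp.read()                    # or rp.parse(lines) for in-memory data
#     rp.can_fetch('MyBot', 'http://www.example.com/index.html')
#     rp.crawl_delay('MyBot')      # delay in seconds, or None if unspecified
#     rp.request_rate('MyBot')     # named tuple (requests, seconds), or None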


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


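# A "Request-rate: m/n" line is expected to mean m requests per n seconds;
# RobotFileParser.request_rate() reports it as a named tuple with
# "requests" and "seconds" fields, which the mixin below compares against
# the expected values.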
class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # these are not actually tested, but we still need to parse the file
    # in order to accommodate the input parameters
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent should be correct. note
    # that this file is incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile"
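    # can_fetch() picks the first entry whose agent token occurs in the
    # requesting user agent, so "Googlebot-Mobile" is matched by the
    # "Googlebot" entry (Disallow: /) before its own Allow: / entry.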
171 robots_txt = """\
Benjamin Petersond6313712008-07-31 16:23:04 +0000172User-agent: Googlebot
173Disallow: /
174
175User-agent: Googlebot-Mobile
176Allow: /
Berker Peksag4da0fd02016-09-11 14:53:16 +0300177 """
178 agent = 'Googlebot'
179 bad = ['/something.jpg']
Benjamin Petersond6313712008-07-31 16:23:04 +0000180
181
Berker Peksag4da0fd02016-09-11 14:53:16 +0300182class UserAgentGoogleMobileTest(UserAgentOrderingTest):
183 agent = 'Googlebot-Mobile'
184
185
186class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
187 # Google also got the order wrong. You need
188 # to specify the URLs from more specific to more general
189 robots_txt = """\
Benjamin Petersond6313712008-07-31 16:23:04 +0000190User-agent: Googlebot
191Allow: /folder1/myfile.html
192Disallow: /folder1/
Berker Peksag4da0fd02016-09-11 14:53:16 +0300193 """
194 agent = 'googlebot'
195 good = ['/folder1/myfile.html']
196 bad = ['/folder1/anotherfile.html']
Benjamin Petersond6313712008-07-31 16:23:04 +0000197
198
Berker Peksag4da0fd02016-09-11 14:53:16 +0300199class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
200 # see issue #6325 for details
201 robots_txt = """\
Senthil Kumaran3f8ab962010-07-28 16:27:56 +0000202User-agent: *
203Disallow: /some/path?name=value
Berker Peksag4da0fd02016-09-11 14:53:16 +0300204 """
205 good = ['/some/path']
206 bad = ['/some/path?name=value']
Senthil Kumaran3f8ab962010-07-28 16:27:56 +0000207
Senthil Kumaran3f8ab962010-07-28 16:27:56 +0000208
Berker Peksag4da0fd02016-09-11 14:53:16 +0300209class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
210 # obey first * entry (#4108)
211 robots_txt = """\
Georg Brandl0a0fc072010-07-29 17:55:01 +0000212User-agent: *
213Disallow: /some/path
214
215User-agent: *
216Disallow: /another/path
Berker Peksag4da0fd02016-09-11 14:53:16 +0300217 """
218 good = ['/another/path']
219 bad = ['/some/path']
Georg Brandl0a0fc072010-07-29 17:55:01 +0000220
Georg Brandl0a0fc072010-07-29 17:55:01 +0000221
Berker Peksag4da0fd02016-09-11 14:53:16 +0300222class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
223 # normalize the URL first (#17403)
224 robots_txt = """\
Senthil Kumaranc70a6ae2013-05-29 05:54:31 -0700225User-agent: *
226Allow: /some/path?
227Disallow: /another/path?
Berker Peksag4da0fd02016-09-11 14:53:16 +0300228 """
229 good = ['/some/path?']
230 bad = ['/another/path?']
Senthil Kumaranc70a6ae2013-05-29 05:54:31 -0700231
Benjamin Petersond6313712008-07-31 16:23:04 +0000232
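# crawl_delay() and request_rate() should fall back to the default ("*")
# entry when the queried agent has no entry of its own.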
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


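# The handler below answers every request with a 403.  If robots.txt cannot
# be read because access is forbidden, the parser is expected to treat the
# whole site as disallowed, which is what PasswordProtectedSiteTestCase
# asserts.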
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
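        # append a trailing slash unless the path already looks like a file
        # name (i.e. it has an extension)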
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()