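"""Tests for urllib.robotparser."""
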
import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer


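# Each concrete test case below supplies a robots.txt body plus "good" and
# "bad" URL lists (entries may also be (agent, url) tuples).  setUp() parses
# the text and the two tests check can_fetch() for every URL in each list.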
class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


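# Subclasses provide the expected ``crawl_delay`` and ``request_rate`` values
# for their robots.txt (or None to skip those checks); test_request_rate()
# compares them with what the parser reports for the configured agent.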
class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertIsInstance(
                        self.parser.request_rate(agent),
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # Crawl-delay and Request-rate are not actually checked for this agent;
    # they are set to None so that test_request_rate() skips those asserts.
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # The order of User-agent entries matters.  Note that this robots.txt
    # file is technically incorrect because "Googlebot" is a substring of
    # "Googlebot-Mobile".
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong.  You need to specify the URLs from
    # more specific to more general.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


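# A request handler that answers every GET with "403 Forbidden", simulating a
# password-protected site where even robots.txt cannot be retrieved.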
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
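        # A 401/403 response for robots.txt makes the parser disallow
        # everything for every agent.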
        self.assertFalse(parser.can_fetch("*", robots_url))


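# These tests hit the live www.pythontest.net server and are skipped unless
# the "network" test resource is enabled.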
class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
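        # A 404 for robots.txt means there are no restrictions: everything is
        # allowed and the parser records no mtime, crawl delay or request rate.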
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()