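"""Tests for urllib.robotparser."""
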
import io
import os
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
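    # Base mixin for the tests below: subclasses set robots_txt, agent, good
    # and bad; test_good_urls() expects every URL in good to be fetchable and
    # test_bad_urls() expects every URL in bad to be rejected by can_fetch().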
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
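    # Adds a check that crawl_delay() and request_rate() return the values
    # expected for the configured agent (skipped when they are set to None).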

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # request_rate and crawl_delay are not tested for this agent, but they
    # still have to be set (to None) so that the inherited
    # test_request_rate() skips those checks
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent entries should be respected.  Note that this
    # robots.txt is incorrect because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class RobotHandler(BaseHTTPRequestHandler):
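    # Answers every GET with 403 so that robots.txt itself is inaccessible,
    # simulating a password-protected site.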

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):
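    # RobotFileParser.read() treats a 401/403 response for robots.txt as
    # "disallow all", so can_fetch() must return False for this server.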

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):
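    # These tests need the 'network' test resource and a live connection to
    # www.pythontest.net, whose robots.txt is fetched once in setUpClass().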

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()