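"""Tests for urllib.robotparser.

A rough sketch of the API exercised below (illustrative only; the agent
name and rules in this snippet are made up):

    >>> import urllib.robotparser
    >>> rp = urllib.robotparser.RobotFileParser()
    >>> rp.parse(['User-agent: *', 'Disallow: /private/'])
    >>> rp.can_fetch('mybot', '/private/page.html')
    False
    >>> rp.can_fetch('mybot', '/public/page.html')
    True
"""
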
import io
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


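# Each concrete test case mixes BaseRobotTest into unittest.TestCase and
# overrides robots_txt, agent, good and bad.  setUp() feeds robots_txt to a
# fresh RobotFileParser, and the inherited test_good_urls/test_bad_urls then
# assert can_fetch() for every entry.  Entries may be plain paths (checked
# against the class-level agent) or (agent, path) tuples.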
class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


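# Crawl-delay and Request-rate are non-standard robots.txt extensions that
# RobotFileParser understands: "Request-rate: 9/30" means at most 9 requests
# per 30 seconds.  The parser exposes them as crawl_delay(agent) and
# request_rate(agent); the latter returns a named tuple with 'requests' and
# 'seconds' fields, matching the namedtuple used below.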
class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']

    def test_request_rate(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate and self.parser.request_rate(agent):
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # Request-rate and Crawl-delay are not checked for this agent; they are
    # set to None so the inherited test_request_rate skips those assertions
    # while the same robots.txt is still parsed.
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent entries should be respected.  Note that this
    # robots.txt file is technically incorrect because "Googlebot" is a
    # substring of "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong.  You need to specify the URLs from
    # more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


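# The handler below answers every request, including /robots.txt, with
# 403 Forbidden.  RobotFileParser.read() treats 401/403 on the robots.txt
# URL as "disallow everything", which is what the test asserts.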
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


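# Live-network check: requires the 'network' test resource and a reachable
# www.python.org, otherwise the test is skipped.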
class NetworkTestCase(unittest.TestCase):

    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

if __name__ == '__main__':
    unittest.main()