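"""Tests for the Python 2 robotparser module."""
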
import os
import robotparser
import unittest
from test import support
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import StringIO
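# threading may be unavailable on some minimal builds; the local HTTP
# server test below is skipped in that case.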
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = StringIO.StringIO(self.robots_txt).readlines()
        self.parser = robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            self.assertFalse(self.parser.can_fetch(agent, url))


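# Each concrete test case below supplies a robots_txt body plus lists of
# good (fetchable) and bad (disallowed) URLs; BaseRobotTest feeds the text
# through RobotFileParser.parse() and checks can_fetch() for every entry.
# For reference, the same API is normally driven against a live site like
# so (a minimal sketch; the host is a placeholder):
#
#     rp = robotparser.RobotFileParser()
#     rp.set_url('http://www.example.com/robots.txt')
#     rp.read()
#     rp.can_fetch('*', 'http://www.example.com/private/page.html')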
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # The order of User-agent entries matters: the first matching entry
    # wins.  Note that this file is ill-formed because "Googlebot" is a
    # substring of "Googlebot-Mobile".
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong: URLs must be specified from more
    # specific to more general.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


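# The Python 2 parser does not understand Crawl-delay or Request-rate;
# they must be ignored without disturbing the entry they appear in.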
class DefaultEntryTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


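# Minimal request handler that answers 403 to every GET, including the
# request for /robots.txt itself.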
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
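        # RobotFileParser.read() treats a 401/403 response for robots.txt
        # as "disallow everything".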
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

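    # Build an absolute URL under base_url, appending a trailing slash
    # when the path has no file extension.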
    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

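    # A missing robots.txt (e.g. HTTP 404) means everything is allowed.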
    def test_read_404(self):
        parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)


def test_main():
    support.run_unittest(
        UserAgentWildcardTest,
        RejectAllRobotsTest,
        UserAgentOrderingTest,
        UserAgentGoogleMobileTest,
        GoogleURLOrderingTest,
        DisallowQueryStringTest,
        UseFirstUserAgentWildcardTest,
        EmptyQueryStringTest,
        DefaultEntryTest,
        PasswordProtectedSiteTestCase,
        NetworkTestCase)


if __name__ == "__main__":
    test_main()