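"""Tests for the robotparser module (Python 2's robots.txt parser)."""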
import os
import robotparser
import unittest
from test import support
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import StringIO
try:
    import threading
except ImportError:
    threading = None


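# BaseRobotTest is a mix-in: it deliberately does not inherit from
# unittest.TestCase, so its test_* methods run only in the concrete
# subclasses below, each of which supplies its own robots.txt sample.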
class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = StringIO.StringIO(self.robots_txt).readlines()
        self.parser = robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            self.assertFalse(self.parser.can_fetch(agent, url))
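
# For reference, the parse()/can_fetch() API exercised above can be driven
# by hand like this (a sketch, not executed as part of the test run):
#
#   rp = robotparser.RobotFileParser()
#   rp.parse(['User-agent: *', 'Disallow: /tmp/'])
#   rp.can_fetch('test_robotparser', '/tmp/xxx')  # -> False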


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # User-agent entries are matched in file order.  Note that this
    # robots.txt is ill-formed: "Googlebot" is a substring of
    # "Googlebot-Mobile", so the first entry captures both agents.
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


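# Because matching is by substring in file order, 'Googlebot-Mobile' also
# matches the 'Googlebot' entry above, so the same URL stays disallowed.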
class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the ordering wrong: URLs need to be listed from the
    # most specific to the most general.  The lowercase agent below
    # additionally checks that user-agent matching is case-insensitive.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


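# Crawl-delay and Request-rate are extensions to the original robots.txt
# format; the parser must still honor the Disallow rule that follows them.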
class DefaultEntryTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


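# str(parser) should reproduce the parsed rules in normalized form, with
# named user agents first and the default (*) entry last.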
class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


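# Serves a 403 for every request, including robots.txt.  robotparser treats
# a 401/403 answer for robots.txt as "disallow everything"; the 404 case
# ("allow everything") is covered by test_read_404 below.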
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


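# Live tests against the fixtures at www.pythontest.net.  They run only
# when the 'network' test resource is enabled (e.g. regrtest's -u network).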
class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)


def test_main():
    support.run_unittest(
        UserAgentWildcardTest,
        RejectAllRobotsTest,
        UserAgentOrderingTest,
        UserAgentGoogleMobileTest,
        GoogleURLOrderingTest,
        DisallowQueryStringTest,
        UseFirstUserAgentWildcardTest,
        EmptyQueryStringTest,
        DefaultEntryTest,
        StringFormattingTest,
        PasswordProtectedSiteTestCase,
        NetworkTestCase)


if __name__ == "__main__":
    test_main()