Victor Stinner | 668489a | 2017-07-05 10:00:33 +0200 | [diff] [blame] | 1 | import os |
| 2 | import robotparser |
| 3 | import unittest |
| 4 | from test import support |
| 5 | from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer |
| 6 | import StringIO |
Ned Deily | c727533 | 2014-03-26 23:25:02 -0700 | [diff] [blame] | 7 | try: |
Victor Stinner | 668489a | 2017-07-05 10:00:33 +0200 | [diff] [blame] | 8 | import threading |
Ned Deily | c727533 | 2014-03-26 23:25:02 -0700 | [diff] [blame] | 9 | except ImportError: |
Victor Stinner | 668489a | 2017-07-05 10:00:33 +0200 | [diff] [blame] | 10 | threading = None |
Ned Deily | c727533 | 2014-03-26 23:25:02 -0700 | [diff] [blame] | 11 | |
Martin v. Löwis | 1c63f6e | 2002-02-28 15:24:47 +0000 | [diff] [blame] | 12 | |
class BaseRobotTest:
    """Mixin that checks one robots.txt body against expected URL verdicts.

    Concrete tests combine this class with ``unittest.TestCase`` and
    override the class attributes below.  Each entry of ``good`` and
    ``bad`` is either a URL string, checked with ``agent``, or an
    ``(agent, url)`` tuple that carries its own user agent.
    """

    robots_txt = ''  # robots.txt text handed to the parser in setUp()
    agent = 'test_robotparser'  # user agent used for plain URL entries
    good = []  # entries for which can_fetch() must be true
    bad = []  # entries for which can_fetch() must be false

    def setUp(self):
        # Build a fresh parser from the class-level robots.txt text.
        lines = StringIO.StringIO(self.robots_txt).readlines()
        self.parser = robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        """Return the ``(agent, url)`` pair for one ``good``/``bad`` entry.

        A tuple entry is returned as its own pair; any other entry is
        paired with ``self.agent``.
        """
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for entry in self.good:
            agent, url = self.get_agent_and_url(entry)
            # Name the offending pair: without a message a failure only
            # says "False is not true" and hides which entry broke.
            self.assertTrue(
                self.parser.can_fetch(agent, url),
                '%r should be allowed to fetch %r' % (agent, url))

    def test_bad_urls(self):
        for entry in self.bad:
            agent, url = self.get_agent_and_url(entry)
            # Same reasoning as in test_good_urls: identify the entry.
            self.assertFalse(
                self.parser.can_fetch(agent, url),
                '%r should not be allowed to fetch %r' % (agent, url))
Martin v. Löwis | 1c63f6e | 2002-02-28 15:24:47 +0000 | [diff] [blame] | 39 | |
Martin v. Löwis | 1c63f6e | 2002-02-28 15:24:47 +0000 | [diff] [blame] | 40 | |
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    """One ``User-agent: *`` record carrying three Disallow rules."""

    # Paths that match none of the Disallow rules below.
    good = ['/', '/test.html']
    # One path for each Disallow rule below.
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
Martin v. Löwis | 1c63f6e | 2002-02-28 15:24:47 +0000 | [diff] [blame] | 50 | |
Martin v. Löwis | 1c63f6e | 2002-02-28 15:24:47 +0000 | [diff] [blame] | 51 | |
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    """A ``User-agent: *`` record whose only rule is ``Disallow: /``."""

    # No URL is expected to be fetchable for this file.
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
Martin v. Löwis | 1c63f6e | 2002-02-28 15:24:47 +0000 | [diff] [blame] | 60 | |
Martin v. Löwis | 1c63f6e | 2002-02-28 15:24:47 +0000 | [diff] [blame] | 61 | |
class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    """Two records whose user-agent names overlap, queried as Googlebot.

    The order of the User-agent records should be correct.  Note that
    this file is incorrect, because "Googlebot" is a substring of
    "Googlebot-Mobile".
    """

    agent = 'Googlebot'
    bad = ['/something.jpg']
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
Skip Montanaro | 1ef19f0 | 2008-07-27 00:49:02 +0000 | [diff] [blame] | 75 | |
| 76 | |
class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    """Reuse UserAgentOrderingTest's file and expectations for another agent."""

    # Only the agent differs; robots_txt and bad are inherited.
    agent = 'Googlebot-Mobile'
| 79 | |
| 80 | |
class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    """An Allow rule for one file listed ahead of a Disallow for its folder.

    Google also got the order wrong: URLs need to be specified from the
    more specific to the more general.
    """

    # Lower-case here, while the record in robots_txt says "Googlebot".
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
Skip Montanaro | 1ef19f0 | 2008-07-27 00:49:02 +0000 | [diff] [blame] | 92 | |
| 93 | |
class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    """A Disallow rule that includes a query string (see issue #6325)."""

    # The bare path stays fetchable; the path plus query string does not.
    good = ['/some/path']
    bad = ['/some/path?name=value']
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
Senthil Kumaran | a4f79f9 | 2010-07-28 16:35:35 +0000 | [diff] [blame] | 102 | |
Senthil Kumaran | a4f79f9 | 2010-07-28 16:35:35 +0000 | [diff] [blame] | 103 | |
class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    """Two ``User-agent: *`` records; the first one is obeyed (#4108)."""

    # Only the path from the first wildcard record is expected to be denied.
    good = ['/another/path']
    bad = ['/some/path']
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
Georg Brandl | 2bd953e | 2010-08-01 20:59:03 +0000 | [diff] [blame] | 115 | |
Georg Brandl | 2bd953e | 2010-08-01 20:59:03 +0000 | [diff] [blame] | 116 | |
class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    """Rules and URLs ending in a bare ``?``.

    The URL is to be normalized first (#17403).
    """

    good = ['/some/path?']
    bad = ['/another/path?']
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
Senthil Kumaran | 2c4810e | 2013-05-29 05:58:47 -0700 | [diff] [blame] | 126 | |
Senthil Kumaran | 2c4810e | 2013-05-29 05:58:47 -0700 | [diff] [blame] | 127 | |
class DefaultEntryTest(BaseRobotTest, unittest.TestCase):
    """A wildcard record that also has Crawl-delay and Request-rate lines."""

    # Only the Disallow rule is exercised by these URL lists.
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
| 137 | |
| 138 | |
class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    """Compare ``str(parser)`` with a fixed rendering of the parsed file."""

    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    # Exact text str() must produce for the file above.  The closing
    # quotes sit at column 0 because the comparison is character-exact.
    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Disallow: /cyberworld/map/

"""

    def test_string_formatting(self):
        rendered = str(self.parser)
        self.assertEqual(rendered, self.expected_output)
| 162 | |
| 163 | |
class RobotHandler(BaseHTTPRequestHandler):
    """HTTP request handler that answers every GET with a 403 error."""

    def log_message(self, format, *args):
        # Intentionally a no-op: this handler writes no log output.
        return None

    def do_GET(self):
        # Whatever path is requested, reply with the same 403 error.
        self.send_error(403, "Forbidden access")
| 171 | |
| 172 | |
@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):
    """Check can_fetch() against a local server that refuses every GET.

    setUp() starts an HTTPServer with RobotHandler on an OS-chosen port
    in a background thread; tearDown() stops the server and joins the
    thread.
    """

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)
        # Registered immediately so the listening socket is released even
        # when the rest of setUp() fails (tearDown() is not run in that
        # case).  Cleanups run after tearDown(), so the usual order stays
        # shutdown -> join -> server_close.
        self.addCleanup(self.server.server_close)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        port = self.server.server_address[1]
        url = 'http://' + support.HOST + ':' + str(port)
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        # Point the parser at the robots.txt resource itself rather than
        # at the site root; RobotHandler refuses both the same way.
        parser.set_url(robots_url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))
Senthil Kumaran | 2c4810e | 2013-05-29 05:58:47 -0700 | [diff] [blame] | 203 | |
Skip Montanaro | 1ef19f0 | 2008-07-27 00:49:02 +0000 | [diff] [blame] | 204 | |
class NetworkTestCase(unittest.TestCase):
    """Tests that read robots.txt files from ``base_url`` over the network.

    setUpClass() requires the 'network' test resource and downloads
    ``robots_txt`` once; the resulting parser is shared by the tests.
    """

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        """Return *path* appended to ``base_url``.

        A trailing '/' is added when *path* has no file extension.
        """
        suffix = '' if os.path.splitext(path)[1] else '/'
        return '{}{}{}'.format(self.base_url, path, suffix)

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)

    def test_can_fetch(self):
        can_fetch = self.parser.can_fetch
        self.assertTrue(can_fetch('*', self.url('elsewhere')))
        self.assertFalse(can_fetch('Nutch', self.base_url))
        self.assertFalse(can_fetch('Nutch', self.url('brian')))
        self.assertFalse(can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(can_fetch('*', self.url('webstats')))
        self.assertTrue(can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
        # This read also goes over the network, so give it the same
        # transient_internet guard that setUpClass() uses.
        with support.transient_internet(self.base_url):
            parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
Florent Xicluna | f37592f | 2010-04-02 17:26:42 +0000 | [diff] [blame] | 241 | |
| 242 | |
def test_main():
    """Run every test case class of this module via support.run_unittest()."""
    test_classes = (
        UserAgentWildcardTest,
        RejectAllRobotsTest,
        UserAgentOrderingTest,
        UserAgentGoogleMobileTest,
        GoogleURLOrderingTest,
        DisallowQueryStringTest,
        UseFirstUserAgentWildcardTest,
        EmptyQueryStringTest,
        DefaultEntryTest,
        StringFormattingTest,
        PasswordProtectedSiteTestCase,
        NetworkTestCase,
    )
    support.run_unittest(*test_classes)


if __name__ == "__main__":
    test_main()