import io
import unittest
import urllib.robotparser
from collections import namedtuple
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class RobotTestCase(unittest.TestCase):
    def __init__(self, index=None, parser=None, url=None, good=None,
                 agent=None, request_rate=None, crawl_delay=None):
        # workaround to make unittest discovery work (see #17066)
        if not isinstance(index, int):
            return
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent
        self.request_rate = request_rate
        self.crawl_delay = crawl_delay

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
            self.assertEqual(self.parser.crawl_delay(agent), self.crawl_delay)
            # if we have actual values for request rate
            if self.request_rate and self.parser.request_rate(agent):
                self.assertEqual(
                    self.parser.request_rate(agent).requests,
                    self.request_rate.requests
                )
                self.assertEqual(
                    self.parser.request_rate(agent).seconds,
                    self.request_rate.seconds
                )
            self.assertEqual(self.parser.request_rate(agent), self.request_rate)
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()
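# RobotTest() below adds RobotTestCase instances to this suite; load_tests()
# at the end of the file merges it into the tests run by unittest.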

def RobotTest(index, robots_txt, good_urls, bad_urls,
              request_rate, crawl_delay, agent="test_robotparser"):

    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent,
                      request_rate, crawl_delay))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent,
                      request_rate, crawl_delay))

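# A minimal, self-contained sketch of the RobotFileParser calls that
# RobotTestCase exercises.  It is not one of the numbered tests below; the
# robots.txt text and the "ExampleBot" agent name are made-up values used
# only for illustration.
_example_parser = urllib.robotparser.RobotFileParser()
_example_parser.parse(io.StringIO(
    "User-agent: *\nDisallow: /private/\n").readlines())
assert _example_parser.can_fetch("ExampleBot", "/index.html")
assert not _example_parser.can_fetch("ExampleBot", "/private/data.html")
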
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']
request_rate = None
crawl_delay = None

RobotTest(1, doc, good, bad, request_rate, crawl_delay)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']
request_rate = None  # Both expected values are None: the Crawl-delay and
crawl_delay = None   # Request-rate fields don't apply to the user agents
                     # checked in this test.

RobotTest(2, doc, good, bad, request_rate, crawl_delay)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']
request_rate = None
crawl_delay = None

RobotTest(3, doc, good, bad, request_rate, crawl_delay)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

# Expected request rate: 9 requests every 30 seconds (the same shape as the
# named tuple returned by RobotFileParser.request_rate()).
request_rate = namedtuple('req_rate', 'requests seconds')(requests=9, seconds=30)
crawl_delay = 3
request_rate_bad = None  # not actually checked (these tests have no good URLs),
crawl_delay_bad = None   # but RobotTest() still requires the arguments


RobotTest(4, doc, good, bad, request_rate, crawl_delay, 'figtree')
RobotTest(5, doc, good, bad, request_rate_bad, crawl_delay_bad,
          'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']
crawl_delay = 3
request_rate = None # since request rate has invalid syntax, return None

RobotTest(6, doc, good, bad, None, None)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
Crawl-delay: pears
"""

good = ['/foo.html']
bad = [] # bug report says "/" should be denied, but that is not in the RFC

crawl_delay = None # since crawl delay has invalid syntax, return None
request_rate = None

RobotTest(7, doc, good, bad, request_rate, crawl_delay)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
crawl_delay = None
request_rate = None # invalid syntax, return None

RobotTest(8, doc, good, bad, request_rate, crawl_delay, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, None, None, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, None, None, agent="Googlebot-Mobile")

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, None, None, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, None, None, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, None, None, agent="googlebot")

265
Senthil Kumaran3f8ab962010-07-28 16:27:56 +0000266# 14. For issue #6325 (query string support)
267doc = """
268User-agent: *
269Disallow: /some/path?name=value
270"""
271
272good = ['/some/path']
273bad = ['/some/path?name=value']
274
Berker Peksag960e8482015-10-08 12:27:06 +0300275RobotTest(14, doc, good, bad, None, None)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad, None, None)

# 16. Empty query string (issue #17403). The URL is normalized before matching.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad, None, None)


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval':0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    def runTest(self):
        self.testPasswordProtectedSite()

    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
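        # The test server answers every request, including the robots.txt
        # fetch, with 403; RobotFileParser.read() treats a 401/403 response
        # as "disallow everything", so can_fetch() must return False.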
        self.assertFalse(parser.can_fetch("*", robots_url))

    def __str__(self):
        return '%s' % self.__class__.__name__

class NetworkTestCase(unittest.TestCase):

    @unittest.skip('does not handle the gzip encoding delivered by pydotorg')
    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

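# load_tests() implements the unittest load_tests protocol: it returns a suite
# containing NetworkTestCase, the RobotTestCase instances collected in
# ``tests`` above, and PasswordProtectedSiteTestCase.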
def load_tests(loader, suite, pattern):
    suite = unittest.makeSuite(NetworkTestCase)
    suite.addTest(tests)
    suite.addTest(PasswordProtectedSiteTestCase())
    return suite

if __name__ == '__main__':
    unittest.main()