import io
import unittest
import urllib.robotparser
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from test import support
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer


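# Each RobotTestCase checks a single URL against a pre-parsed robots.txt:
# a "good" URL must be fetchable for the given agent, a "bad" URL must not.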
class RobotTestCase(unittest.TestCase):
    def __init__(self, index=None, parser=None, url=None, good=None, agent=None):
        # workaround to make unittest discovery work (see #17066)
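        # (discovery instantiates TestCase subclasses with just a test-name
        # string, so skip the custom initialization in that case)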
        if not isinstance(index, int):
            return
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

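# Helper: parse robots_txt and register one RobotTestCase per URL in
# good_urls (expected to be allowed) and bad_urls (expected to be disallowed).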
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query string (issue #17403). The URL is normalized before matching.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)


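# Request handler that answers 403 to every request, simulating a site
# whose robots.txt is not accessible to anonymous clients.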
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval':0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    def runTest(self):
        self.testPasswordProtectedSite()

    def testPasswordProtectedSite(self):
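        # The server above answers 403 to every request, so the parser's
        # attempt to read the robots rules fails; an inaccessible robots.txt
        # is expected to disallow everything, hence can_fetch() returns False.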
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))

    def __str__(self):
        return '%s' % self.__class__.__name__

class NetworkTestCase(unittest.TestCase):

    @unittest.skip('does not handle the gzip encoding delivered by pydotorg')
    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

def load_tests(loader, suite, pattern):
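    # load_tests protocol: return the suite assembled by the RobotTest()
    # calls plus the network and password-protected-site cases defined above.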
    suite = unittest.makeSuite(NetworkTestCase)
    suite.addTest(tests)
    suite.addTest(PasswordProtectedSiteTestCase())
    return suite

if __name__ == '__main__':
    unittest.main()