import io
import unittest
import urllib.robotparser
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
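# ``threading`` may be unavailable on some builds; the live-server test below
# is skipped in that case.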
try:
    import threading
except ImportError:
    threading = None


class RobotTestCase(unittest.TestCase):
    def __init__(self, index=None, parser=None, url=None, good=None, agent=None):
        # workaround to make unittest discovery work (see #17066)
        if not isinstance(index, int):
            return
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

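# Module-level suite; RobotTest() below fills it with RobotTestCase instances
# and load_tests() at the bottom of the file adds it to the test run.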
tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

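# Each RobotTest(...) call below parses one sample robots.txt: URLs listed in
# ``good`` must be fetchable by the given agent, URLs in ``bad`` must not.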
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

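# urllib.robotparser applies the first matching rule within an agent's entry,
# so in the examples below an Allow line only takes effect when it appears
# before the broader Disallow line.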
# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query (issue #17403). Normalizing the url first.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)


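# Replies 403 to every request, standing in for a password-protected site.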
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


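# RobotFileParser.read() treats a 401/403 response for robots.txt as
# "disallow all", so can_fetch() must return False for every URL on this site.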
@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    def runTest(self):
        self.testPasswordProtectedSite()

    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))

    def __str__(self):
        return '%s' % self.__class__.__name__

class NetworkTestCase(unittest.TestCase):

    @unittest.skip('does not handle the gzip encoding delivered by pydotorg')
    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

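# load_tests() hands unittest a suite combining the network test, the
# table-driven ``tests`` suite built above, and the live-server test.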
def load_tests(loader, suite, pattern):
    suite = unittest.makeSuite(NetworkTestCase)
    suite.addTest(tests)
    suite.addTest(PasswordProtectedSiteTestCase())
    return suite

if __name__ == '__main__':
    unittest.main()