import unittest, StringIO, robotparser
from test import test_support
from urllib2 import urlopen, HTTPError

HAVE_HTTPS = True
try:
    from urllib2 import HTTPSHandler
except ImportError:
    HAVE_HTTPS = False
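
# Note: urllib2 defines HTTPSHandler only when Python is built with SSL
# support; HAVE_HTTPS lets testPythonOrg below be skipped on builds that
# lack it.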

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))
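
# Each RobotTest() call below parses one robots.txt body and registers a
# RobotTestCase per URL: entries in good_urls must be fetchable, entries in
# bad_urls must be blocked.  An (agent, url) tuple in either list overrides
# the default agent for that single check.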

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html', '/', '/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = []  # XFAIL '/a/b.html'
bad = ['/tmp', '/tmp.html', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', '/a%2fb.html',
       '/~joe/index.html',
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp']  # XFAIL: '/a%2fb.html'
bad = ['/tmp/', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = []  # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")
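
# Allow is an extension to the original robots.txt standard (popularized by
# Google); robotparser honors it, which is what test 8 exercises.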

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
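
# A sketch of the substring matching behind tests 9 and 10 (hypothetical
# interactive session; robotparser lowercases agent names and matches them
# as substrings, so the "Googlebot" entry captures "Googlebot-Mobile" too):
#
#   >>> p = robotparser.RobotFileParser()
#   >>> p.parse(doc.splitlines())
#   >>> p.can_fetch("Googlebot-Mobile", "/something.jpg")
#   False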

# 11. Get the order correct: the more specific agent entry must come first.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")

# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")

# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query string (issue #17403); the URL is normalized before
# matching.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)

class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        # A robots.txt that answers 401/403 must make can_fetch() deny
        # everything (robotparser treats those codes as "disallow all").
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    @unittest.skipUnless(HAVE_HTTPS, 'need SSL support to download license')
    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__ == '__main__':
    test_support.verbose = 1
    test_main()
Collin Winterc2898c52007-04-25 17:29:52 +0000295 test_main()