import io
import unittest
import urllib.robotparser
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from test import support

class RobotTestCase(unittest.TestCase):
    # One can_fetch() check: `good` is true if the URL should be fetchable.
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

# Build a parser for robots_txt and add one test case per good/bad URL
# to the module-level suite.
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

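# Each RobotTest() call below boils down to the core parser API; a
# minimal sketch (the robots.txt body and URLs are illustrative only,
# not part of the suite):
#
#     parser = urllib.robotparser.RobotFileParser()
#     parser.parse(io.StringIO("User-agent: *\nDisallow: /tmp/\n").readlines())
#     parser.can_fetch("test_robotparser", "/tmp/x.html")    # -> False
#     parser.can_fetch("test_robotparser", "/index.html")    # -> True
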
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

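# Why tests 9 and 10 behave identically: the user-agent check is a
# case-insensitive substring test, roughly (a sketch of the stdlib
# logic, not a verbatim copy):
#
#     useragent = useragent.split("/")[0].lower()
#     for agent in entry.useragents:
#         if agent == "*" or agent.lower() in useragent:
#             return True
#
# so the "Googlebot" entry already matches "Googlebot-Mobile" and the
# more specific entry is never consulted.
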
# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8.  You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

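# The issue #6325 fix keeps the query string when can_fetch() normalises
# the request URL before matching it against the rules; roughly (a
# sketch, not the verbatim stdlib code):
#
#     parts = urllib.parse.urlparse(url)
#     url = urllib.parse.quote(
#         urllib.parse.urlunparse(('', '', parts.path, parts.params,
#                                  parts.query, parts.fragment)))
#
# which is why '/some/path?name=value' is blocked while '/some/path'
# alone is not.
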
# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

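# The issue #4108 fix honours only the first catch-all group; roughly (a
# sketch of the stdlib's entry bookkeeping, not the verbatim code):
#
#     if "*" in entry.useragents:
#         if self.default_entry is None:
#             self.default_entry = entry    # first "User-agent: *" wins
#     else:
#         self.entries.append(entry)
#
# so the second "User-agent: *" group above is ignored entirely.
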
class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        support.requires('network')
        with support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = urllib.robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except URLError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

def test_main():
    support.run_unittest(NetworkTestCase)
    support.run_unittest(tests)

if __name__ == '__main__':
    support.verbose = 1
    test_main()