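"""Tests for urllib.robotparser (RobotFileParser)."""
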
import io
import unittest
import urllib.robotparser
from urllib.error import URLError, HTTPError
from urllib.request import urlopen
from test import support

class RobotTestCase(unittest.TestCase):
    def __init__(self, index=None, parser=None, url=None, good=None, agent=None):
        # workaround to make unittest discovery work (see #17066)
        if not isinstance(index, int):
            return
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

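# Helper: register one RobotTestCase per URL for the given robots.txt text.
# URLs in good_urls must be allowed by the parser and URLs in bad_urls must
# be disallowed; the resulting cases are collected into the suite above.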
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11.  Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13.  Google also got the order wrong in #8.  You need to specify the
#      URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")


# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)

# 16. Empty query (issue #17403): the URL is normalized before matching.
doc = """
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""

good = ['/some/path?']
bad = ['/another/path?']

RobotTest(16, doc, good, bad)


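# The following tests talk to live servers and are therefore guarded by the
# 'network' test resource.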
class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        support.requires('network')
        with support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = urllib.robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except URLError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False)

    def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = urllib.robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))

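# load_tests() merges the network test cases with the RobotTest suite built
# above so that unittest discovery runs both.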
def load_tests(loader, suite, pattern):
    suite = unittest.makeSuite(NetworkTestCase)
    suite.addTest(tests)
    return suite

if __name__ == '__main__':
    support.use_resources = ['network']
    unittest.main()