""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PYTHON 2.0 OPEN SOURCE LICENSE

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
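
    A minimal usage sketch (the URL and agent name below are
    illustrative only):

        rp = RobotFileParser('http://www.example.com/robots.txt')
        rp.read()
        if rp.can_fetch('MyCrawler', 'http://www.example.com/secret.html'):
            pass  # MyCrawler may fetch the page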
"""
import re, urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg


class RobotFileParser:
    """Provides a set of methods to read, parse and answer questions
    about a single robots.txt file."""

    def __init__(self, url=''):
        self.entries = []
        self.disallow_all = 0
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched."""
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = f.readlines()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            # access to robots.txt itself is forbidden: disallow everything
            self.disallow_all = 1
            _debug("disallow all")
        elif self.errcode >= 400:
            # robots.txt could not be fetched: allow everything
            self.allow_all = 1
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line that is not preceded by one or more
        blank lines."""
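        # parser states:
        #   0 -- start of file or after a blank line: no current entry
        #   1 -- one or more user-agent: lines seen, no rule lines yet
        #   2 -- at least one allow: or disallow: line seen for the entry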
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            line = line.strip()
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = line[1].strip()
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self.entries.append(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
               (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # agent not found ==> access granted
        return 1

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
    (allowance==0) followed by a path."""
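    # For example, the robots.txt line "Disallow: /cgi-bin/" corresponds
    # to RuleLine("/cgi-bin/", 0), and "Allow: /" to RuleLine("/", 1).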
    def __init__(self, path, allowance):
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # the quoted path is used as an anchored regular expression, so
        # for plain paths this amounts to a prefix match
        return self.path == "*" or re.match(self.path, filename)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: " + agent + "\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent."""
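        # For example, a client agent of 'CherryPickerSE/1.0' is reduced
        # to the name token 'cherrypickerse'; the entry applies if that
        # token occurs as a substring of one of its agent names.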
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return 1
            agent = agent.lower()
            # don't forget to re.escape
            if re.search(re.escape(useragent), agent):
                return 1
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL quoted, as the rule line paths are"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return 1


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        # guard against redirect loops: after maxtries redirections in a
        # row, give up and report a server error
        self.tries += 1
        if self.tries >= self.maxtries:
            return self.http_error_default(url, fp, 500,
                                           "Internal Server Error: Redirect Recursion",
                                           headers)
        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
                                                      errmsg, headers, data)
        self.tries = 0
        return result


def _check(a, b):
    # compare the actual allowance (a) against the expected one (b)
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print


def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)


if __name__ == '__main__':
    _test()