""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PYTHON 2.0 OPEN SOURCE LICENSE

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
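
# A minimal usage sketch (URL and agent name are made-up examples):
#
#   rp = RobotFileParser("http://www.example.com/robots.txt")
#   rp.read()
#   if rp.can_fetch("ExampleBot/1.0", "http://www.example.com/page.html"):
#       pass  # the agent may fetch the page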
import urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg


class RobotFileParser:
    """Parse a single robots.txt file and answer fetch-permission
       questions about it for a given user agent."""

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = 0
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched."""
        return self.last_checked

    def modified(self):
        """Record that the robots.txt file was fetched just now."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Fetch self.url and, if it could be read, feed it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = f.readlines()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            # authorization required or forbidden: assume everything is off-limits
            self.disallow_all = 1
            _debug("disallow all")
        elif self.errcode >= 400:
            # any other client/server error: assume there are no restrictions
            self.allow_all = 1
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        A user-agent: line need not be preceded by one or more
        blank lines."""
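        # For example (a hypothetical input), the lines
        #
        #   User-agent: ExampleBot
        #   Disallow: /private/
        #
        #   User-agent: *
        #   Disallow: /tmp/
        #
        # produce one entry for "ExampleBot" and a default entry for "*".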
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            line = line.strip()
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    if "*" in entry.useragents:
                        # the default entry is considered last
                        self.default_entry = entry
                    else:
                        self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i>=0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self.entries.append(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state==2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))


    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
               (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for given user agent matches; the first match counts
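        # reduce the url to its path component, re-quoted so that it is
        # comparable with the quoted paths stored in each RuleLine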
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return 1


    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
       (allowance==0) followed by a path."""
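    # e.g. RuleLine("/cgi-bin", 0) forbids "/cgi-bin" and everything
    # below it, since applies_to() matches by simple string prefix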
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty Disallow: value means allow all
            allowance = 1
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # "*" matches everything; otherwise match by path prefix
        return self.path=="*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow")+": "+self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: "+agent+"\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
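        # e.g. "CherryPickerSE/1.0" yields the name token "cherrypickerse"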
        for agent in self.useragents:
            if agent=='*':
                # we have the catch-all agent
                return 1
            agent = agent.lower()
            # a substring match is enough
            if useragent.find(agent) != -1:
                return 1
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is the URL-quoted path component of the URL to check"""
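        # rule lines are scanned in file order; the first line whose
        # path prefix-matches the filename decides the outcome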
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        # no rule matched ==> access granted
        return 1

class URLopener(urllib.FancyURLopener):
    """A FancyURLopener that remembers the last HTTP status code."""
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200
        self.tries = 0
        self.maxtries = 10
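        # guard against redirect loops: http_error_302 answers with a
        # 500 once maxtries redirects have been followed in a row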

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        self.tries += 1
        if self.tries >= self.maxtries:
            return self.http_error_default(url, fp, 500,
                                           "Internal Server Error: Redirect Recursion",
                                           headers)
        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
                                                      errmsg, headers, data)
        self.tries = 0
        return result


def _check(a, b):
    # a is the result can_fetch() returned, b the expected value
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print

def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, reached via a redirect
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # the root should be allowed for an arbitrary agent
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case-insensitive agent matching
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for the catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()