blob: 5b759d4968d892efe0673097ce50191a7bb2c779 [file] [log] [blame]
Skip Montanaro663f6c22001-01-20 15:59:25 +00001""" robotparser.py
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00002
Skip Montanaro663f6c22001-01-20 15:59:25 +00003 Copyright (C) 2000 Bastian Kleineidam
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00004
Skip Montanaro663f6c22001-01-20 15:59:25 +00005 You can choose between two licenses when using this package:
6 1) GNU GPLv2
Martin v. Löwisd22368f2002-03-18 10:41:20 +00007 2) PSF license for Python 2.2
Skip Montanaro663f6c22001-01-20 15:59:25 +00008
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000011"""
Eric S. Raymond141971f2001-02-09 08:40:40 +000012import re,urlparse,urllib
Skip Montanaro663f6c22001-01-20 15:59:25 +000013
# Public API of this module.
__all__ = ["RobotFileParser"]

# Module-level debug flag; set to a true value to make _debug() print messages.
debug = 0
17
18def _debug(msg):
19 if debug: print msg
20
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000021
class RobotFileParser:
    """Parse a robots.txt file and answer per-agent fetch-permission queries.

    Typical use: set_url(), read(), then can_fetch(useragent, url).
    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
    """
    def __init__(self, url=''):
        # Entry objects for specific user-agents, in file order.
        self.entries = []
        # Entry for the catch-all "*" agent; consulted after self.entries.
        self.default_entry = None
        # Set when robots.txt was forbidden (401/403): deny everything.
        self.disallow_all = 0
        # Set when robots.txt was absent (other errors >= 400): allow everything.
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time (time.time() value) robots.txt was last checked."""
        return self.last_checked

    def modified(self):
        """Record the current time as the last robots.txt check time."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file; cache host and path."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Fetch self.url and, on a 200 response, parse its contents.

        Side effects: sets self.errcode to the HTTP status; 401/403 set
        disallow_all, any other status >= 400 sets allow_all.
        """
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = 1
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = 1
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        """File entry as the catch-all default or as an agent-specific entry."""
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robot.txt file.
        We allow that a user-agent: line is not preceded by
        one or more blank lines."""
        # state: 0 = expecting user-agent, 1 = saw user-agent line(s),
        #        2 = saw at least one rule line for the current entry
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state==1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state==2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i>=0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state==2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state==0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                        # BUGFIX: mark the entry as having rules, matching the
                        # disallow branch; otherwise an entry holding only
                        # Allow: lines was dropped at the next blank line/EOF.
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s"%(linenumber, line))
        if state==2:
            # BUGFIX: route the final entry through _add_entry (was a direct
            # self.entries.append), so a trailing catch-all "*" entry becomes
            # default_entry instead of matching every agent positionally.
            self._add_entry(entry)
        _debug("Parsed rules:\n%s" % str(self))


    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        _debug("Checking robot.txt allowance for:\n user agent: %s\n url: %s" %
               (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return 1


    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret
158
159
class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
    (allowance==0) followed by a path."""
    def __init__(self, path, allowance):
        # A bare "Disallow:" with no value means everything is allowed.
        if path == '' and not allowance:
            allowance = 1
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # "*" matches any path; otherwise match on path prefix.
        if self.path == "*":
            return 1
        return filename.startswith(self.path)

    def __str__(self):
        if self.allowance:
            kind = "Allow"
        else:
            kind = "Disallow"
        return kind + ": " + self.path
175
176
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        # agent name tokens this entry applies to
        self.useragents = []
        # RuleLine objects, in file order
        self.rulelines = []

    def __str__(self):
        pieces = []
        for agent in self.useragents:
            pieces.append("User-agent: " + agent + "\n")
        for rule in self.rulelines:
            pieces.append(str(rule) + "\n")
        return "".join(pieces)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # Compare only the name token (before any "/version"), lower-cased.
        name = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return 1
            if name.find(agent.lower()) != -1:
                return 1
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        # First matching rule wins; no matching rule means allowed.
        for rule in self.rulelines:
            _debug((filename, str(rule), rule.allowance))
            if rule.applies_to(filename):
                return rule.allowance
        return 1
213
class URLopener(urllib.FancyURLopener):
    """FancyURLopener that records the last HTTP error status.

    errcode stays 200 unless http_error_default fires with another code.
    """
    def __init__(self, *args):
        # Direct base-class call replaces the deprecated apply() builtin;
        # extended call syntax has been available since Python 2.0.
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Remember the status code, then defer to the default handling.
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
223
Skip Montanaro5bba2312001-02-12 20:58:30 +0000224def _check(a,b):
225 if not b:
226 ac = "access denied"
227 else:
228 ac = "access allowed"
229 if a!=b:
230 print "failed"
231 else:
232 print "ok (%s)" % ac
233 print
Skip Montanaro663f6c22001-01-20 15:59:25 +0000234
def _test():
    """Run a live smoke test of RobotFileParser (requires network access)."""
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    cherry_url = ('http://www.musi-cal.com/cgi-bin/event-search'
                  '?city=San+Francisco')
    for picker in ('CherryPickerSE', 'CherryPickerSE/1.0',
                   'CherryPickerSE/1.5'):
        _check(rp.can_fetch(picker, cherry_url), 0)
    # case sensitivity
    for agent in ('ExtractorPro', 'extractorpro'):
        _check(rp.can_fetch(agent, 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
Skip Montanaro663f6c22001-01-20 15:59:25 +0000273
# Run the live self-test (requires network access) when executed as a script.
if __name__ == '__main__':
    _test()