""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PYTHON 2.0 OPEN SOURCE LICENSE

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
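# Typical usage, a minimal sketch (the URL and agent name below are only
# illustrative):
#
#   rp = RobotFileParser()
#   rp.set_url('http://example.com/robots.txt')
#   rp.read()
#   if rp.can_fetch('MyBot/1.0', 'http://example.com/some/page.html'):
#       ...fetch the page...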
import re, string, urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg


class RobotFileParser:
    """This class provides a set of methods to read, parse, and answer
    questions about a single robots.txt file."""

    def __init__(self, url=''):
        self.entries = []
        self.disallow_all = 0
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched."""
        return self.last_checked

    def modified(self):
        """Record that the robots.txt file was fetched just now."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Fetch the robots.txt URL, following up to five redirects,
        and feed it to the parser."""
        import httplib
        tries = 0
        while tries < 5:
            connection = httplib.HTTP(self.host)
            connection.putrequest("GET", self.path)
            connection.putheader("Host", self.host)
            connection.endheaders()
            status, text, mime = connection.getreply()
            if status in [301, 302] and mime:
                # follow the redirect given in the Location (or Uri) header
                tries = tries + 1
                newurl = mime.get("Location", mime.get("Uri", ""))
                newurl = urlparse.urljoin(self.url, newurl)
                self.set_url(newurl)
            else:
                break
        if status == 401 or status == 403:
            # fetching robots.txt itself is forbidden: disallow everything
            self.disallow_all = 1
        elif status >= 400:
            # no usable robots.txt (e.g. 404): allow everything
            self.allow_all = 1
        else:
            # status < 400
            self.parse(connection.getfile().readlines())

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line that is not preceded by
        one or more blank lines."""
        state = 0
        linenumber = 0
        entry = Entry()

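        # 'state' records where we are within an entry: 0 = before any
        # user-agent: line, 1 = after one or more user-agent: lines,
        # 2 = after at least one allow:/disallow: rule line.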
        for line in lines:
            line = string.strip(line)
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = string.split(line, ':', 1)
            if len(line) == 2:
                line[0] = string.lower(string.strip(line[0]))
                line[1] = string.strip(line[1])
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self.entries.append(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                        # like disallow, an allow line completes an entry
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))
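
    # An illustrative input that parse() accepts (hypothetical, not taken
    # from any real site):
    #
    #   User-agent: *
    #   Disallow: /cgi-bin/   # trailing comments are stripped
    #
    #   User-agent: GoodBot
    #   Allow: /public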

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for\n%s\n%s" % (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for the first entry that applies to the given user agent;
        # the first match counts
        useragent = string.lower(useragent)
        url = urllib.quote(urlparse.urlparse(url)[2])
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # agent not found ==> access granted
        return 1

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
    (allowance==0) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty "Disallow:" value means allow all, per the draft RFC
            allowance = 1
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # "*" applies to every path; otherwise the rule path is matched
        # against the start of the filename
        return self.path == "*" or re.match(self.path, filename)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
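
# Note: RuleLine.applies_to passes the rule path to re.match, so the path
# acts as a regular-expression prefix rather than a literal one; e.g.
# re.match("/tmp", "/tmp2/foo") succeeds, so "Disallow: /tmp" also blocks
# "/tmp2/foo".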


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: " + agent + "\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        "check if this entry applies to the specified agent"
        for agent in self.useragents:
            if agent == "*":
                return 1
            # useragent arrives lowercased from can_fetch, so lowercase
            # the stored agent name as well before matching
            if re.match(string.lower(agent), useragent):
                return 1
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return 1
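
# Rule precedence in Entry.allowance is "first match wins".  Given the
# (hypothetical) entry
#
#   User-agent: *
#   Allow: /folder/public
#   Disallow: /folder
#
# a fetch of /folder/public.html is permitted only because the Allow line
# precedes the Disallow line; with the order reversed it would be refused.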


def _test():
    global debug
    import sys
    rp = RobotFileParser()
    debug = 1
    if len(sys.argv) <= 1:
        rp.set_url('http://www.musi-cal.com/robots.txt')
        rp.read()
    else:
        rp.parse(open(sys.argv[1]).readlines())
    print rp.can_fetch('*', 'http://www.musi-cal.com/')
    print rp.can_fetch('Musi-Cal-Robot/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco')

if __name__ == '__main__':
    _test()