""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PYTHON 2.0 OPEN SOURCE LICENSE

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import re, string, urlparse, urllib

debug = 0

def _debug(msg):
    if debug: print msg

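# Minimal usage sketch (same host as in _test() below; any robots.txt
# URL works):
#
#     rp = RobotFileParser()
#     rp.set_url('http://www.musi-cal.com/robots.txt')
#     rp.read()
#     print rp.can_fetch('*', 'http://www.musi-cal.com/')
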
class RobotFileParser:
    def __init__(self, url=''):
        self.entries = []
        self.disallow_all = 0
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        return self.last_checked

    def modified(self):
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        import httplib
        tries = 0
        # follow at most five redirects when fetching robots.txt
        while tries < 5:
            connection = httplib.HTTP(self.host)
            connection.putrequest("GET", self.path)
            connection.putheader("Host", self.host)
            connection.endheaders()
            status, text, mime = connection.getreply()
            if status in [301, 302] and mime:
                tries = tries + 1
                newurl = mime.get("Location", mime.get("Uri", ""))
                newurl = urlparse.urljoin(self.url, newurl)
                self.set_url(newurl)
            else:
                break
        if status == 401 or status == 403:
            # fetching robots.txt is forbidden: disallow everything
            self.disallow_all = 1
        elif status >= 400:
            # no robots.txt available: allow everything
            self.allow_all = 1
        else:
            # status < 400
            self.parse(connection.getfile().readlines())

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.
           A new user-agent: line is accepted even when it is not
           preceded by one or more blank lines."""
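        # state records progress through an entry: 0 = expecting a
        # user-agent line, 1 = inside the user-agent list, 2 = inside
        # the allow/disallow rule lines.  For example, the block
        #
        #     User-agent: *
        #     Disallow: /cgi-bin/
        #
        # moves through states 0 -> 1 -> 2 and produces one Entry.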
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            line = string.strip(line)
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = string.split(line, ':', 1)
            if len(line) == 2:
                line[0] = string.lower(string.strip(line[0]))
                line[1] = string.strip(line[1])
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self.entries.append(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                        # an allow rule also ends the user-agent section;
                        # without this the entry would be dropped at EOF
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                                                                 line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))


    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for\n%s\n%s" % (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # search for given user agent matches; the first match counts
        useragent = string.lower(useragent)
        url = urllib.quote(urlparse.urlparse(url)[2])
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # agent not found ==> access granted
        return 1

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
       (allowance==0) followed by a path."""
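    # For instance, RuleLine("/cgi-bin/", 0) prints as "Disallow: /cgi-bin/"
    # and applies_to() is true for any path beginning with /cgi-bin/, since
    # the stored path is used as a regular expression anchored at the start.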
    def __init__(self, path, allowance):
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or re.match(self.path, filename)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: " + agent + "\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        "check if this entry applies to the specified agent"
        for agent in self.useragents:
            if agent == "*":
                return 1
            # can_fetch() lowercases the requesting agent, so lowercase
            # the pattern from robots.txt too before matching
            if re.match(string.lower(agent), useragent):
                return 1
        return 0

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL quoted (can_fetch quotes it before calling us)"""
        # the first matching rule line decides; the default is to allow
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return 1


def _test():
    global debug
    import sys
    rp = RobotFileParser()
    debug = 1
    if len(sys.argv) <= 1:
        rp.set_url('http://www.musi-cal.com/robots.txt')
        rp.read()
    else:
        rp.parse(open(sys.argv[1]).readlines())
    print rp.can_fetch('*', 'http://www.musi-cal.com/')
    print rp.can_fetch('Musi-Cal-Robot/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco')

if __name__ == '__main__':
    _test()