blob: 3f4396bbf578024a7b7636edaaa1900d3d8ac978 [file] [log] [blame]
"""

Robots.txt file parser class.  Accepts a list of lines or a robots.txt URL as
input, builds a set of rules from that list, then answers questions about
the fetchability of other URLs.

"""
8
class RobotFileParser:
    """Parse a robots.txt file and answer whether a URL may be fetched.

    Attributes:
        rules: dict mapping each user-agent string seen in the file to a
            list of compiled patterns, one per disallowed path prefix.
        debug: when true, parsing and lookups print a trace to stdout.
        url: location of the robots.txt file used by read().
        last_checked: time.time() of the last parse() call, 0 if never.
    """

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _trace(self, *items):
        """Print the space-joined items when debugging is enabled."""
        if self.debug:
            print(' '.join([str(item) for item in items]))

    def mtime(self):
        """Return the time the rules were last parsed (0 if never)."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were parsed."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL that read() will fetch."""
        self.url = url

    def read(self):
        """Fetch self.url and feed its lines to parse()."""
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        handle = urlopen(self.url)
        try:
            lines = handle.readlines()
        finally:
            # Always release the connection, even if reading fails.
            handle.close()
        self.parse(lines)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        lines is an iterable of text (or byte) lines, with or without
        trailing newlines.  Each "User-agent" line adds an agent to the
        current record; each non-empty "Disallow" line adds a path prefix
        for every agent of the record; an empty "Disallow" clears the
        rules of those agents.  A blank line ends the current record.
        """
        import re
        active = []
        for line in lines:
            # urlopen() yields bytes on Python 3; on Python 2 bytes is str.
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            # Strip the line terminator explicitly so that a final line
            # without a newline does not lose its last character.
            line = line.rstrip('\r\n')
            self._trace('>', line)
            # blank line terminates current record
            if not line:
                active = []
                continue
            # remove optional comment and strip line
            hash_pos = line.find('#')
            if hash_pos >= 0:
                line = line[:hash_pos]
            line = line.strip()
            if not line:
                continue
            # Split on the first colon only: values may contain colons.
            parts = line.split(':', 1)
            if len(parts) != 2:
                self._trace('>> unknown:', parts)
                continue
            field = parts[0].strip().lower()
            value = parts[1].strip()
            if field == 'user-agent':
                # this record applies to this user agent
                self._trace('>> user-agent:', value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif field == 'disallow':
                if value:
                    self._trace('>> disallow:', value)
                    # The path is a literal prefix, not a regular
                    # expression, so escape regex metacharacters.
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    # An empty Disallow means everything is allowed.
                    for agent in active:
                        self._trace('>> allow', agent)
                        self.rules[agent] = []
            else:
                self._trace('>> unknown:', [field, value])

        self.modified()

    # returns true if agent is allowed to fetch url
    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url.

        Rules stored under the exact useragent string are used when
        present, otherwise the rules for '*'.  Returns 1 when the fetch
        is allowed and 0 when the URL's path starts with a disallowed
        prefix.
        """
        try:
            from urllib.parse import urlparse  # Python 3
        except ImportError:
            from urlparse import urlparse  # Python 2
        agent = useragent
        if agent not in self.rules:
            agent = '*'
        if agent not in self.rules:
            self._trace('>> allowing', url, 'fetch by', useragent)
            return 1
        # A URL without a path refers to the site root.
        path = urlparse(url)[2] or '/'
        for rule in self.rules[agent]:
            if rule.match(path) is not None:
                self._trace('>> disallowing', url, 'fetch by', useragent)
                return 0
        self._trace('>> allowing', url, 'fetch by', useragent)
        return 1
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000085
def _test():
    """Fetch a live robots.txt file and print sample lookups.

    Needs network access: the parser's read() downloads the file.
    """
    parser = RobotFileParser()
    parser.debug = 1
    parser.set_url('http://www.musi-cal.com/robots.txt')
    parser.read()
    print(parser.rules)
    # Only the path component of the URL is consulted by can_fetch().
    root_allowed = parser.can_fetch('*', 'http://www.musi-cal.com.com/')
    print(root_allowed)
    search_url = (
        'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
    search_allowed = parser.can_fetch('Musi-Cal-Robot', search_url)
    print(search_allowed)
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000095
# Run the self-test only when executed as a script, not when imported.
if __name__ == '__main__':
    _test()