| """ |
| |
| Robots.txt file parser class. Accepts a list of lines or robots.txt URL as |
| input, builds a set of rules from that list, then answers questions about |
| fetchability of other URLs. |
| |
| """ |
| |
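# Example usage -- a minimal sketch.  The robots.txt lines and URLs below are
# hypothetical; feeding parse() a list of lines avoids a network fetch:
#
#     rp = RobotFileParser()
#     rp.parse(['User-agent: *\n', 'Disallow: /private/\n'])
#     rp.can_fetch('*', 'http://example.com/private/data.html')   # -> 0
#     rp.can_fetch('*', 'http://example.com/index.html')          # -> 1
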
class RobotFileParser:

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last checked."""
        return self.last_checked

    def modified(self):
        """Record that the robots.txt file was checked just now."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url

    def read(self):
        """Read the robots.txt URL and feed it to the parser."""
        import urllib
        self.parse(urllib.urlopen(self.url).readlines())

    def parse(self, lines):
        """Parse the input lines from a robots.txt file."""
        import string, re
        active = []
        for line in lines:
            if self.debug: print '>', line,
            # blank line terminates the current record
            if not string.strip(line):
                active = []
                continue
            # remove an optional trailing comment and strip the line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            # split into at most two fields so colons in the value survive
            line = re.split(' *: *', line, 1)
            if len(line) == 2:
                line[0] = string.lower(line[0])
                if line[0] == 'user-agent':
                    # this record applies to this user agent
                    if self.debug: print '>> user-agent:', line[1]
                    active.append(line[1])
                    if not self.rules.has_key(line[1]):
                        self.rules[line[1]] = []
                elif line[0] == 'disallow':
                    if line[1]:
                        if self.debug: print '>> disallow:', line[1]
                        # the disallow value is compiled as a regex and later
                        # matched as a prefix of the request path
                        for agent in active:
                            self.rules[agent].append(re.compile(line[1]))
                    else:
                        # an empty Disallow field means everything is allowed
                        for agent in active:
                            if self.debug: print '>> allow', agent
                            self.rules[agent] = []
                else:
                    if self.debug: print '>> unknown:', line

        self.modified()

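    # The table built by parse() maps each user-agent string to a list of
    # compiled patterns.  For example (hypothetical robots.txt content):
    #
    #     User-agent: *
    #     Disallow: /tmp/     ->   self.rules == {'*': [<pattern '/tmp/'>]}
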
    # returns true if the agent is allowed to fetch the url
    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        import urlparse
        ag = useragent
        # fall back to the wildcard record if this agent has no record
        if not self.rules.has_key(ag): ag = '*'
        if not self.rules.has_key(ag):
            if self.debug: print '>> allowing', url, 'fetch by', useragent
            return 1
        # each stored pattern is matched against the start of the URL path
        path = urlparse.urlparse(url)[2]
        for rule in self.rules[ag]:
            if rule.match(path) is not None:
                if self.debug: print '>> disallowing', url, 'fetch by', useragent
                return 0
        if self.debug: print '>> allowing', url, 'fetch by', useragent
        return 1

def _test():
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()
    print rp.rules
    print rp.can_fetch('*', 'http://www.musi-cal.com/')
    print rp.can_fetch('Musi-Cal-Robot',
                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')

if __name__ == "__main__":
    _test()