| """ robotparser.py |
| |
| Copyright (C) 2000 Bastian Kleineidam |
| |
| You can choose between two licenses when using this package: |
| 1) GNU GPLv2 |
| 2) PYTHON 2.0 OPEN SOURCE LICENSE |
| |
| The robots.txt Exclusion Protocol is implemented as specified in |
| http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html |
| """ |
| import re,string,urlparse,urllib |
| |
# Names exported by "from <this module> import *".
__all__ = ["RobotFileParser"]
| |
# Set to a true value to make _debug() print its diagnostics.
debug = 0


def _debug(msg):
    """Print msg, but only while the module-level debug flag is set."""
    if not debug:
        return
    print(msg)
| |
| |
class RobotFileParser:
    """Fetch, parse and query a robots.txt file.

    Typical use: pass the robots.txt URL to the constructor (or call
    set_url()), call read(), then ask can_fetch(useragent, url).

    Attributes:
      entries       -- list of parsed Entry records, in file order
      disallow_all  -- set to 1 by read() when the server answers 401/403
      allow_all     -- set to 1 by read() for any other status >= 400
      last_checked  -- value stored by modified(); 0 until it is called
    """

    def __init__(self, url=''):
        self.entries = []
        self.disallow_all = 0
        self.allow_all = 0
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time stored by the last modified() call (0 if none)."""
        return self.last_checked

    def modified(self):
        """Store the current time as the moment of the last check."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Remember url and split out the host and path used by read()."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Request the robots.txt URL over HTTP and parse the answer.

        301/302 answers are followed by re-requesting the new location;
        at most five requests are made.  A final status of 401 or 403
        sets disallow_all, any other status >= 400 sets allow_all, and
        every other answer has its body handed to parse().
        """
        import httplib
        tries = 0
        while tries < 5:
            connection = httplib.HTTP(self.host)
            # a URL without a path component must still request "/"
            connection.putrequest("GET", self.path or "/")
            connection.putheader("Host", self.host)
            connection.endheaders()
            status, text, mime = connection.getreply()
            if status in [301, 302] and mime:
                tries = tries + 1
                newurl = mime.get("Location", mime.get("Uri", ""))
                newurl = urlparse.urljoin(self.url, newurl)
                self.set_url(newurl)
            else:
                break
        if status == 401 or status == 403:
            self.disallow_all = 1
        elif status >= 400:
            self.allow_all = 1
        else:
            # status < 400
            self.parse(connection.getfile().readlines())

    def parse(self, lines):
        """parse the input lines from a robot.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines.

        Completed records are appended to self.entries; records from
        earlier parse() calls are kept.
        """
        # state 0: outside a record
        # state 1: inside a record, only user-agent lines seen so far
        # state 2: inside a record that already has allow/disallow rules
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            line = line.strip()
            linenumber = linenumber + 1
            if not line:
                # a blank line ends the current record
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self.entries.append(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            parts = line.split(':', 1)
            if len(parts) != 2:
                _debug("line %d: error: malformed line %s"
                       % (linenumber, parts))
                continue
            key = parts[0].strip().lower()
            value = parts[1].strip()
            if key == "user-agent":
                if state == 2:
                    _debug("line %d: warning: you should insert a blank"
                           " line before any user-agent"
                           " directive" % linenumber)
                    self.entries.append(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif key == "disallow":
                if state == 0:
                    _debug("line %d: error: you must insert a user-agent:"
                           " directive before this line" % linenumber)
                else:
                    entry.rulelines.append(RuleLine(value, 0))
                    state = 2
            elif key == "allow":
                if state == 0:
                    _debug("line %d: error: you must insert a user-agent:"
                           " directive before this line" % linenumber)
                else:
                    entry.rulelines.append(RuleLine(value, 1))
                    # an Allow rule completes a record just like a
                    # Disallow rule; without this, records holding only
                    # Allow lines were never stored
                    state = 2
            else:
                _debug("line %d: warning: unknown key %s" % (linenumber,
                       key))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url

        Returns 0 or 1 when disallow_all/allow_all decide the matter or
        no record applies; otherwise returns the allowance reported by
        the selected record.
        """
        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        useragent = useragent.lower()
        # a URL without a path component refers to the root document
        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
        # The first record naming this agent wins.  Records for "*" are
        # only a fallback, so one placed early in the file must not
        # shadow a later record that names the agent explicitly.
        default_entry = None
        for entry in self.entries:
            if "*" in entry.useragents:
                if default_entry is None:
                    default_entry = entry
                continue
            if entry.applies_to(useragent):
                return entry.allowance(url)
        if default_entry is not None:
            return default_entry.allowance(url)
        # agent not found ==> access granted
        return 1

    def __str__(self):
        """Return every stored record as text, one blank line after each."""
        return "".join([str(entry) + "\n" for entry in self.entries])
| |
| |
class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
       (allowance==0) followed by a path.

    The path is stored URL-quoted in self.path.
    """

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty "Disallow:" value excludes nothing, i.e. it
            # allows everything
            allowance = 1
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return a true value if filename starts with this rule's path."""
        # Rule paths are literal prefixes, not regular expressions: a "."
        # in a path must only match a ".".  The "*" comparison is kept
        # for rules whose path attribute was assigned directly.
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        if self.allowance:
            keyword = "Allow"
        else:
            keyword = "Disallow"
        return keyword + ": " + self.path
| |
| |
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        """Return the record as text: user-agent lines, then rule lines."""
        lines = ["User-agent: " + agent for agent in self.useragents]
        lines = lines + [str(rule) for rule in self.rulelines]
        return "".join([text + "\n" for text in lines])

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent

        Returns 1 if the entry lists "*" or an agent name that occurs,
        ignoring case, in the name part of useragent (the text before
        the first "/"); otherwise returns 0.
        """
        # compare name tokens only: drop the version part and ignore case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == "*":
                return 1
            # plain substring test: names from robots.txt are literal
            # text, not regular expressions
            if useragent.find(agent.lower()) >= 0:
                return 1
        return 0

    def allowance(self, filename):
        """Return the allowance of the first rule line matching filename.

        Preconditions:
        - our agent applies to this entry
        - filename is in the same form as the rule line paths

        Returns 1 when no rule line matches.
        """
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return 1
| |
| |
def _test():
    """Self-test: load a robots.txt file and print two can_fetch() results.

    With no command line argument the rules are fetched from
    http://www.musi-cal.com/robots.txt; otherwise the first argument
    names a local file to parse.  Debug output is switched on.
    """
    global debug
    import sys
    rp = RobotFileParser()
    debug = 1
    if len(sys.argv) <= 1:
        rp.set_url('http://www.musi-cal.com/robots.txt')
        rp.read()
    else:
        # close the file even if reading or parsing fails
        robots_file = open(sys.argv[1])
        try:
            rp.parse(robots_file.readlines())
        finally:
            robots_file.close()
    print(rp.can_fetch('*', 'http://www.musi-cal.com/'))
    print(rp.can_fetch('Musi-Cal-Robot/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'))
| |
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    _test()