| """ | 
 |  | 
 | Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as | 
 | input, builds a set of rules from that list, then answers questions about | 
 | fetchability of other URLs. | 
 |  | 
 | """ | 

class RobotFileParser:

    def __init__(self):
        self.rules = {}         # maps a user-agent name to a list of compiled disallow patterns
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def mtime(self):
        # return the time the rules were last updated (set by modified())
        return self.last_checked

    def modified(self):
        # record the time the rules were last updated
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
##      import urlmisc
##      self.url = urlmisc.canonical_url(url)

    def read(self):
        # fetch the robots.txt file at self.url and build the rules from it
        import urllib
        self.parse(urllib.urlopen(self.url).readlines())

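    # parse() consumes the lines of a robots.txt file.  For reference, a
    # typical record (example only) looks like:
    #
    #     User-agent: SomeRobot
    #     Disallow: /cgi-bin
    #     Disallow: /private
    #
    # Records are separated by blank lines, '#' starts a comment, and a
    # Disallow line with an empty value clears the rules for the agents
    # named in the current record.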
    def parse(self, lines):
        import regsub, string, regex
        active = []             # agents named in the current record
        for line in lines:
            if self.debug: print '>', line,
            # blank line terminates current record
            if not string.strip(line):
                active = []
                continue
            # remove optional comment and strip line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = regsub.split(line, ' *: *')
            if len(line) == 2:
                line[0] = string.lower(line[0])
                if line[0] == 'user-agent':
                    # this record applies to this user agent
                    if self.debug: print '>> user-agent:', line[1]
                    active.append(line[1])
                    if not self.rules.has_key(line[1]):
                        self.rules[line[1]] = []
                elif line[0] == 'disallow':
                    if line[1]:
                        if self.debug: print '>> disallow:', line[1]
                        for agent in active:
                            self.rules[agent].append(regex.compile(line[1]))
                    else:
                        # an empty Disallow value clears the rules,
                        # i.e. everything is allowed for these agents
                        for agent in active:
                            if self.debug: print '>> allow', agent
                            self.rules[agent] = []
                else:
                    if self.debug: print '>> unknown:', line

        self.modified()

    # returns true if agent is allowed to fetch url
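    # For illustration: a rule compiled from '/cgi-bin' matches any request
    # path that begins with '/cgi-bin', so such paths are reported as not
    # fetchable for the agents the rule applies to (the old regex module's
    # match() is anchored at the start of the string and returns -1 when
    # there is no match, hence the != -1 test below).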
    def can_fetch(self, agent, url):
        import urlparse
        ag = agent
        if not self.rules.has_key(ag): ag = '*'
        if not self.rules.has_key(ag):
            if self.debug: print '>> allowing', url, 'fetch by', agent
            return 1
        path = urlparse.urlparse(url)[2]
        for rule in self.rules[ag]:
            if rule.match(path) != -1:
                if self.debug: print '>> disallowing', url, 'fetch by', agent
                return 0
        if self.debug: print '>> allowing', url, 'fetch by', agent
        return 1

def test():
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.automatrix.com/robots.txt')
    rp.read()
    print rp.rules
    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
    print rp.can_fetch('Musi-Cal-Robot',
                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')

    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
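
# A convenience sketch, not part of the parser itself: run the exercise
# routine above when this module is executed directly as a script (assumes
# that is the desired behaviour).
if __name__ == '__main__':
    test()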