| """ | 
 |  | 
 | Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as | 
 | input, builds a set of rules from that list, then answers questions about | 
 | fetchability of other URLs. | 
 |  | 
 | """ | 

class RobotFileParser:

    def __init__(self):
        self.rules = {}         # maps a user-agent name to a list of compiled disallow patterns
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def mtime(self):
        # return the time the rules were last updated (set by modified())
        return self.last_checked

    def modified(self):
        # record the time the rules were last updated
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
##      import urlmisc
##      self.url = urlmisc.canonical_url(url)

    def read(self):
        # fetch the robots.txt file at self.url and build the rules from it
        import urllib
        self.parse(urllib.urlopen(self.url).readlines())

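    # parse() consumes the lines of a robots.txt file.  For reference, a
    # typical record (example only) looks like:
    #
    #     User-agent: SomeRobot
    #     Disallow: /cgi-bin
    #     Disallow: /private
    #
    # Records are separated by blank lines, '#' starts a comment, and a
    # Disallow line with an empty value clears the rules for the agents
    # named in the current record.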
    def parse(self, lines):
        import regsub, string, regex
        active = []             # agents named in the current record
        for line in lines:
            if self.debug: print '>', line,
            # blank line terminates current record
            if not string.strip(line):
                active = []
                continue
            # remove optional comment and strip line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = regsub.split(line, ' *: *')
            if len(line) == 2:
                line[0] = string.lower(line[0])
                if line[0] == 'user-agent':
                    # this record applies to this user agent
                    if self.debug: print '>> user-agent:', line[1]
                    active.append(line[1])
                    if not self.rules.has_key(line[1]):
                        self.rules[line[1]] = []
                elif line[0] == 'disallow':
                    if line[1]:
                        if self.debug: print '>> disallow:', line[1]
                        for agent in active:
                            self.rules[agent].append(regex.compile(line[1]))
                    else:
                        # an empty Disallow value clears the rules,
                        # i.e. everything is allowed for these agents
                        for agent in active:
                            if self.debug: print '>> allow', agent
                            self.rules[agent] = []
                else:
                    if self.debug: print '>> unknown:', line

        self.modified()

    # returns true if agent is allowed to fetch url
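    # For illustration: a rule compiled from '/cgi-bin' matches any request
    # path that begins with '/cgi-bin', so such paths are reported as not
    # fetchable for the agents the rule applies to (the old regex module's
    # match() is anchored at the start of the string and returns -1 when
    # there is no match, hence the != -1 test below).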
    def can_fetch(self, agent, url):
        import urlparse
        ag = agent
        if not self.rules.has_key(ag): ag = '*'
        if not self.rules.has_key(ag):
            if self.debug: print '>> allowing', url, 'fetch by', agent
            return 1
        path = urlparse.urlparse(url)[2]
        for rule in self.rules[ag]:
            if rule.match(path) != -1:
                if self.debug: print '>> disallowing', url, 'fetch by', agent
                return 0
        if self.debug: print '>> allowing', url, 'fetch by', agent
        return 1

def test():
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.automatrix.com/robots.txt')
    rp.read()
    print rp.rules
    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
    print rp.can_fetch('Musi-Cal-Robot',
                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')

    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
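
# A convenience sketch, not part of the parser itself: run the exercise
# routine above when this module is executed directly as a script (assumes
# that is the desired behaviour).
if __name__ == '__main__':
    test()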