blob: 6f85afacd9d21a605d94f92d6c10cf33d959c2ed [file] [log] [blame]
"""

Robots.txt file parser class.  Accepts a list of lines or a robots.txt
URL as input, builds a set of rules from that list, then answers
questions about fetchability of other URLs.

"""
8
9class RobotFileParser:
10
11 def __init__(self):
Guido van Rossum986abac1998-04-06 14:29:28 +000012 self.rules = {}
13 self.debug = 0
14 self.url = ''
15 self.last_checked = 0
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000016
17 def mtime(self):
Guido van Rossum986abac1998-04-06 14:29:28 +000018 return self.last_checked
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000019
20 def modified(self):
Guido van Rossum986abac1998-04-06 14:29:28 +000021 import time
22 self.last_checked = time.time()
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000023
24 def set_url(self, url):
Guido van Rossum986abac1998-04-06 14:29:28 +000025 self.url = url
26## import urlmisc
27## self.url = urlmisc.canonical_url(url)
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000028
29 def read(self):
Guido van Rossum986abac1998-04-06 14:29:28 +000030 import urllib
31 self.parse(urllib.urlopen(self.url).readlines())
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000032
33 def parse(self, lines):
Guido van Rossum986abac1998-04-06 14:29:28 +000034 import regsub, string, regex
35 active = []
36 for line in lines:
37 if self.debug: print '>', line,
38 # blank line terminates current record
39 if not line[:-1]:
40 active = []
41 continue
42 # remove optional comment and strip line
43 line = string.strip(line[:string.find(line, '#')])
44 if not line:
45 continue
46 line = regsub.split(line, ' *: *')
47 if len(line) == 2:
48 line[0] = string.lower(line[0])
49 if line[0] == 'user-agent':
50 # this record applies to this user agent
51 if self.debug: print '>> user-agent:', line[1]
52 active.append(line[1])
53 if not self.rules.has_key(line[1]):
54 self.rules[line[1]] = []
55 elif line[0] == 'disallow':
56 if line[1]:
57 if self.debug: print '>> disallow:', line[1]
58 for agent in active:
59 self.rules[agent].append(regex.compile(line[1]))
60 else:
61 pass
62 for agent in active:
63 if self.debug: print '>> allow', agent
64 self.rules[agent] = []
65 else:
66 if self.debug: print '>> unknown:', line
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000067
Guido van Rossum986abac1998-04-06 14:29:28 +000068 self.modified()
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000069
70 # returns true if agent is allowed to fetch url
71 def can_fetch(self, agent, url):
Guido van Rossum986abac1998-04-06 14:29:28 +000072 import urlparse
73 ag = agent
74 if not self.rules.has_key(ag): ag = '*'
75 if not self.rules.has_key(ag):
76 if self.debug: print '>> allowing', url, 'fetch by', agent
77 return 1
78 path = urlparse.urlparse(url)[2]
79 for rule in self.rules[ag]:
80 if rule.match(path) != -1:
81 if self.debug: print '>> disallowing', url, 'fetch by', agent
82 return 0
83 if self.debug: print '>> allowing', url, 'fetch by', agent
84 return 1
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000085
86def test():
87 rp = RobotFileParser()
88 rp.debug = 1
89 rp.set_url('http://www.automatrix.com/robots.txt')
90 rp.read()
91 print rp.rules
92 print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
93 print rp.can_fetch('Musi-Cal-Robot',
Guido van Rossum986abac1998-04-06 14:29:28 +000094 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000095
96 print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
97 print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')