"""

Robots.txt file parser class.  Accepts a list of lines or a robots.txt URL
as input, builds a set of rules from that list, then answers questions
about the fetchability of other URLs.

"""
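# A minimal usage sketch (an assumed example; the URL and agent name below
# are placeholders, not taken from this module):
#
#     rp = RobotFileParser()
#     rp.set_url('http://example.com/robots.txt')
#     rp.read()
#     if rp.can_fetch('ExampleBot', 'http://example.com/private.html'):
#         ...fetch the page...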

class RobotFileParser:

    def __init__(self):
        self.rules = {}         # maps a user agent name to a list of rules
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def mtime(self):
        # time the robots.txt file was last fetched
        return self.last_checked

    def modified(self):
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        self.url = url
        ## import urlmisc
        ## self.url = urlmisc.canonical_url(url)

    def read(self):
        # fetch the robots.txt file and parse it
        import urllib
        self.parse(urllib.urlopen(self.url).readlines())

    def parse(self, lines):
        import regsub, string, regex
        active = []
        for line in lines:
            if self.debug: print '>', line,
            # blank line terminates current record
            if not line[:-1]:
                active = []
                continue
            # remove optional comment, then strip the line
            i = string.find(line, '#')
            if i >= 0:
                line = line[:i]
            line = string.strip(line)
            if not line:
                continue
            line = regsub.split(line, ' *: *')
            if len(line) == 2:
                line[0] = string.lower(line[0])
                if line[0] == 'user-agent':
                    # this record applies to this user agent
                    if self.debug: print '>> user-agent:', line[1]
                    active.append(line[1])
                    if not self.rules.has_key(line[1]):
                        self.rules[line[1]] = []
                elif line[0] == 'disallow':
                    if line[1]:
                        # non-empty value: compile it as a rule for
                        # every agent named in the current record
                        if self.debug: print '>> disallow:', line[1]
                        for agent in active:
                            self.rules[agent].append(regex.compile(line[1]))
                    else:
                        # empty value: everything is allowed again
                        for agent in active:
                            if self.debug: print '>> allow', agent
                            self.rules[agent] = []
                else:
                    if self.debug: print '>> unknown:', line

        self.modified()
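    # A sketch of feeding parse() a record directly (the lines below are a
    # made-up example, not from this module): a blank line ends a record,
    # 'user-agent' opens one, and each non-empty 'disallow' value becomes
    # a compiled pattern for every agent named in the record.
    #
    #     rp = RobotFileParser()
    #     rp.parse(['User-agent: *\n', 'Disallow: /cgi-bin/\n', '\n'])
    #     rp.can_fetch('AnyBot', 'http://example.com/cgi-bin/foo')  # -> 0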

    # returns true if agent is allowed to fetch url
    def can_fetch(self, agent, url):
        import urlparse
        # fall back to the wildcard record if this agent has none
        ag = agent
        if not self.rules.has_key(ag): ag = '*'
        if not self.rules.has_key(ag):
            # no record applies at all: everything is allowed
            if self.debug: print '>> allowing', url, 'fetch by', agent
            return 1
        # match only the path component of the url against the rules
        path = urlparse.urlparse(url)[2]
        for rule in self.rules[ag]:
            if rule.match(path) != -1:
                if self.debug: print '>> disallowing', url, 'fetch by', agent
                return 0
        if self.debug: print '>> allowing', url, 'fetch by', agent
        return 1
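    # A fallback sketch (made-up example values): an agent with its own
    # record is checked against that record; any other agent falls back
    # to the '*' record, and with no applicable record at all can_fetch()
    # returns 1.
    #
    #     rp = RobotFileParser()
    #     rp.parse(['User-agent: BadBot\n', 'Disallow: /\n'])
    #     rp.can_fetch('BadBot', 'http://example.com/x')   # -> 0
    #     rp.can_fetch('NiceBot', 'http://example.com/x')  # -> 1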

def test():
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.automatrix.com/robots.txt')
    rp.read()
    print rp.rules
    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
    print rp.can_fetch('Musi-Cal-Robot',
                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')

    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')