"""Robots.txt file parser class.

Accepts a list of lines or a robots.txt URL as input, builds a set of
rules from that list, then answers questions about fetchability of
other URLs.
"""

| 9 | class RobotFileParser: |
| 10 | |
| 11 | def __init__(self): |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 12 | self.rules = {} |
| 13 | self.debug = 0 |
| 14 | self.url = '' |
| 15 | self.last_checked = 0 |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 16 | |
| 17 | def mtime(self): |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 18 | return self.last_checked |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 19 | |
| 20 | def modified(self): |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 21 | import time |
| 22 | self.last_checked = time.time() |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 23 | |
| 24 | def set_url(self, url): |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 25 | self.url = url |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 26 | |
| 27 | def read(self): |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 28 | import urllib |
| 29 | self.parse(urllib.urlopen(self.url).readlines()) |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 30 | |
| 31 | def parse(self, lines): |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 32 | """parse the input lines from a robot.txt file""" |
| 33 | import string, re |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 34 | active = [] |
| 35 | for line in lines: |
| 36 | if self.debug: print '>', line, |
| 37 | # blank line terminates current record |
| 38 | if not line[:-1]: |
| 39 | active = [] |
| 40 | continue |
| 41 | # remove optional comment and strip line |
| 42 | line = string.strip(line[:string.find(line, '#')]) |
| 43 | if not line: |
| 44 | continue |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 45 | line = re.split(' *: *', line) |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 46 | if len(line) == 2: |
| 47 | line[0] = string.lower(line[0]) |
| 48 | if line[0] == 'user-agent': |
| 49 | # this record applies to this user agent |
| 50 | if self.debug: print '>> user-agent:', line[1] |
| 51 | active.append(line[1]) |
| 52 | if not self.rules.has_key(line[1]): |
| 53 | self.rules[line[1]] = [] |
| 54 | elif line[0] == 'disallow': |
| 55 | if line[1]: |
| 56 | if self.debug: print '>> disallow:', line[1] |
| 57 | for agent in active: |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 58 | self.rules[agent].append(re.compile(line[1])) |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 59 | else: |
| 60 | pass |
| 61 | for agent in active: |
| 62 | if self.debug: print '>> allow', agent |
| 63 | self.rules[agent] = [] |
| 64 | else: |
| 65 | if self.debug: print '>> unknown:', line |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 66 | |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 67 | self.modified() |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 68 | |
| 69 | # returns true if agent is allowed to fetch url |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 70 | def can_fetch(self, useragent, url): |
| 71 | """using the parsed robots.txt decide if useragent can fetch url""" |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 72 | import urlparse |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 73 | ag = useragent |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 74 | if not self.rules.has_key(ag): ag = '*' |
| 75 | if not self.rules.has_key(ag): |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 76 | if self.debug: print '>> allowing', url, 'fetch by', useragent |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 77 | return 1 |
| 78 | path = urlparse.urlparse(url)[2] |
| 79 | for rule in self.rules[ag]: |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 80 | if rule.match(path) is not None: |
| 81 | if self.debug: print '>> disallowing', url, 'fetch by', useragent |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 82 | return 0 |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 83 | if self.debug: print '>> allowing', url, 'fetch by', useragent |
Guido van Rossum | 986abac | 1998-04-06 14:29:28 +0000 | [diff] [blame] | 84 | return 1 |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 85 | |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 86 | def _test(): |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 87 | rp = RobotFileParser() |
| 88 | rp.debug = 1 |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 89 | rp.set_url('http://www.musi-cal.com/robots.txt') |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 90 | rp.read() |
| 91 | print rp.rules |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 92 | print rp.can_fetch('*', 'http://www.musi-cal.com.com/') |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 93 | print rp.can_fetch('Musi-Cal-Robot', |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 94 | 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco') |
Guido van Rossum | bbf8c2f | 1997-01-30 03:18:23 +0000 | [diff] [blame] | 95 | |
Guido van Rossum | dc8b798 | 2000-03-27 19:29:31 +0000 | [diff] [blame] | 96 | if __name__ == "__main__": |
| 97 | _test() |