blob: 30baa055d2a8be95bc4589596f98fc184b47a903 [file] [log] [blame]
Skip Montanaro663f6c22001-01-20 15:59:25 +00001""" robotparser.py
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00002
Skip Montanaro663f6c22001-01-20 15:59:25 +00003 Copyright (C) 2000 Bastian Kleineidam
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00004
Skip Montanaro663f6c22001-01-20 15:59:25 +00005 You can choose between two licenses when using this package:
6 1) GNU GPLv2
Martin v. Löwisd22368f2002-03-18 10:41:20 +00007 2) PSF license for Python 2.2
Skip Montanaro663f6c22001-01-20 15:59:25 +00008
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000011"""
Jeremy Hylton1afc1692008-06-18 20:49:58 +000012
13import urllib.parse, urllib.request
Skip Montanaro663f6c22001-01-20 15:59:25 +000014
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000015__all__ = ["RobotFileParser"]
16
class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    Instance attributes (all initialised by the constructor):

    entries       -- list of parsed records that name specific user agents,
                     in the order they were added
    default_entry -- the record for the catch-all agent "*", or None
    disallow_all  -- set to True by read() on an HTTP 401/403 response
    allow_all     -- set to True by read() on any other HTTP error >= 400
    last_checked  -- timestamp stored by modified(); 0 until it is called
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.  The value is
        whatever modified() stored last (0 if it was never called).
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file.

        Also stores the network location and path components of the
        URL as self.host and self.path.
        """
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser.

        HTTP 401/403 sets disallow_all; any other HTTP error with a
        status >= 400 sets allow_all.  On success the body is decoded
        as UTF-8 and handed to parse().
        """
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            # Always release the connection, even if reading fails.
            try:
                raw = f.read()
            finally:
                f.close()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        """File a finished record under entries or default_entry."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins, matching the
                # "first match counts" rule used by can_fetch()
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.

        lines is an iterable of strings without line terminators.
        Records are stored through _add_entry(); nothing is returned.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                # A blank line terminates the current record.
                if state == 1:
                    # user-agent line(s) without any rule: discard
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            line = line.partition('#')[0].strip()
            if not line:
                continue
            parts = line.split(':', 1)
            if len(parts) != 2:
                # not a "field: value" line; ignore it
                continue
            field = parts[0].strip().lower()
            value = urllib.parse.unquote(parts[1].strip())
            if field == "user-agent":
                if state == 2:
                    # a new record starts without a separating blank line
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif field == "disallow":
                # rules before any user-agent line are ignored
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif field == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
        if state == 2:
            # File the last record the same way as every other one, so a
            # trailing "User-agent: *" record becomes the default entry
            # whether or not the file ends with a blank line.
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url.

        Returns False if disallow_all is set, True if allow_all is set;
        otherwise the answer of the first entry that applies to
        useragent, then of the default entry, and True if neither exists.
        """
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        # keep only the part after scheme and host, then re-quote it so it
        # is compared in the same form RuleLine stores its paths in
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        """Return all records, the default entry last, one per block."""
        entries = list(self.entries)
        if self.default_entry is not None:
            entries.append(self.default_entry)
        return ''.join([str(entry) + "\n" for entry in entries])
Skip Montanaro663f6c22001-01-20 15:59:25 +0000149
150
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path.

    path is stored percent-quoted, except for the bare wildcard "*",
    which is kept as-is so that applies_to() can recognise it.
    """

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        if path == "*":
            # quote() would turn "*" into "%2A", which would make the
            # wildcard test in applies_to() unreachable
            self.path = path
        else:
            self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return True if this rule matches filename.

        A rule matches when its path is the wildcard "*" or when
        filename starts with the rule's (quoted) path.
        """
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        """Return the rule in robots.txt syntax."""
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
Skip Montanaro663f6c22001-01-20 15:59:25 +0000166
167
class Entry:
    """One robots.txt record: one or more user-agents and zero or more
    rulelines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        """Return the record in robots.txt syntax, one line per item."""
        pieces = ["User-agent: " + name + "\n" for name in self.useragents]
        pieces += [str(rule) + "\n" for rule in self.rulelines]
        return ''.join(pieces)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent.

        Only the part of useragent before the first "/" is considered,
        compared case-insensitively.  The entry applies when one of its
        agents is "*" or occurs as a substring of that name token.
        """
        # split the name token and make it lower case
        token = useragent.split("/")[0].lower()
        return any(
            name == '*' or name.lower() in token
            for name in self.useragents
        )

    def allowance(self, filename):
        """Return the allowance of the first ruleline matching filename.

        Precondition: our agent applies to this entry.
        Returns True when no ruleline matches.
        """
        matching = (
            rule.allowance
            for rule in self.rulelines
            if rule.applies_to(filename)
        )
        return next(matching, True)