blob: daac29c68dc36d2c771ae2a8f13b07aa36a4b2c9 [file] [log] [blame]
Skip Montanaro663f6c22001-01-20 15:59:25 +00001""" robotparser.py
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00002
Skip Montanaro663f6c22001-01-20 15:59:25 +00003 Copyright (C) 2000 Bastian Kleineidam
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00004
Skip Montanaro663f6c22001-01-20 15:59:25 +00005 You can choose between two licenses when using this package:
6 1) GNU GPLv2
Martin v. Löwisd22368f2002-03-18 10:41:20 +00007 2) PSF license for Python 2.2
Skip Montanaro663f6c22001-01-20 15:59:25 +00008
9 The robots.txt Exclusion Protocol is implemented as specified in
Raymond Hettinger122541b2014-05-12 21:56:33 -070010 http://www.robotstxt.org/norobots-rfc.txt
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000011"""
Jeremy Hylton1afc1692008-06-18 20:49:58 +000012
Berker Peksag960e8482015-10-08 12:27:06 +030013import collections
14import urllib.parse
15import urllib.request
Skip Montanaro663f6c22001-01-20 15:59:25 +000016
# Public name exported by this module.
__all__ = ["RobotFileParser"]

# Named pair parsed from a "Request-rate: <requests>/<seconds>" line.
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
21
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []          # Entry objects for specific user-agents
        self.default_entry = None  # Entry for the "*" user-agent, if any
        self.disallow_all = False  # fetching robots.txt was forbidden
        self.allow_all = False     # robots.txt did not exist (other 4xx)
        self.set_url(url)
        self.last_checked = 0      # time of last fetch; 0 means never fetched

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                # Access to robots.txt itself is forbidden: disallow all.
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                # robots.txt does not exist: everything is allowed.
                self.allow_all = True
            # NOTE(review): 5xx errors leave the parser in its initial state,
            # so can_fetch() keeps returning False (last_checked stays 0).
        else:
            # Fix: close the response even if decoding/parsing fails
            # (the original leaked the connection).
            with f:
                raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    # user-agent line(s) with no rules: drop the record
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        """Return the Crawl-delay for *useragent*, or None if unknown."""
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        # Fix: the original unconditionally dereferenced self.default_entry
        # and raised AttributeError when no "*" record existed (bpo-35922).
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        """Return the Request-rate for *useragent*, or None if unknown."""
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        # Fix: same missing-default-entry guard as crawl_delay() (bpo-35922).
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def __str__(self):
        # NOTE(review): the default ("*") entry is not included here,
        # matching the original behavior.
        return ''.join([str(entry) + "\n" for entry in self.entries])
Skip Montanaro663f6c22001-01-20 15:59:25 +0000194
195
class RuleLine:
    """One "Allow:"/"Disallow:" directive: a path plus a boolean allowance
    (True for Allow, False for Disallow)."""

    def __init__(self, path, allowance):
        if not allowance and path == '':
            # A bare "Disallow:" with no path means allow everything.
            allowance = True
        normalized = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(normalized)
        self.allowance = allowance

    def applies_to(self, filename):
        # A literal "*" path matches any filename.
        if self.path == "*":
            return True
        return filename.startswith(self.path)

    def __str__(self):
        verb = "Allow" if self.allowance else "Disallow"
        return "%s: %s" % (verb, self.path)
Skip Montanaro663f6c22001-01-20 15:59:25 +0000212
213
class Entry:
    """A robots.txt record: one or more user-agent names plus their rules."""

    def __init__(self):
        self.useragents = []  # agent name tokens this record covers
        self.rulelines = []   # rule lines, in file order
        self.delay = None     # Crawl-delay value, if present
        self.req_rate = None  # Request-rate value, if present

    def __str__(self):
        parts = ["User-agent: %s\n" % agent for agent in self.useragents]
        parts.extend("%s\n" % line for line in self.rulelines)
        return ''.join(parts)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # Compare only the product token (before any "/version"),
        # case-insensitively; "*" is the catch-all agent.
        name = useragent.split("/")[0].lower()
        return any(agent == '*' or agent.lower() in name
                   for agent in self.useragents)

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded

        The first matching rule wins; no matching rule grants access."""
        for rule in self.rulelines:
            if rule.applies_to(filename):
                return rule.allowance
        return True