blob: a7137a3064ff3cd133894e1b9f9b945daf1f1790 [file] [log] [blame]
Skip Montanaro663f6c22001-01-20 15:59:25 +00001""" robotparser.py
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00002
Skip Montanaro663f6c22001-01-20 15:59:25 +00003 Copyright (C) 2000 Bastian Kleineidam
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +00004
Skip Montanaro663f6c22001-01-20 15:59:25 +00005 You can choose between two licenses when using this package:
6 1) GNU GPLv2
Martin v. Löwisd22368f2002-03-18 10:41:20 +00007 2) PSF license for Python 2.2
Skip Montanaro663f6c22001-01-20 15:59:25 +00008
9 The robots.txt Exclusion Protocol is implemented as specified in
Raymond Hettingera5413c42014-05-12 22:18:50 -070010 http://www.robotstxt.org/norobots-rfc.txt
11
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000012"""
Skip Montanarob8bdbc02008-04-28 03:27:53 +000013import urlparse
14import urllib
Skip Montanaro663f6c22001-01-20 15:59:25 +000015
# Public API: only the parser class; RuleLine/Entry/URLopener are internal.
__all__ = ["RobotFileParser"]
17
Guido van Rossumbbf8c2f1997-01-30 03:18:23 +000018
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        # Entries for named user-agents, in the order they were parsed.
        self.entries = []
        # The entry whose user-agent list contains "*"; consulted only
        # after every named entry has failed to match.
        self.default_entry = None
        # True after a 401/403 response: every URL is disallowed.
        self.disallow_all = False
        # True after any other 4xx response: the file is treated as
        # absent and every URL is allowed.
        self.allow_all = False
        self.set_url(url)
        # Timestamp of the last successful fetch; 0 means "never fetched".
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # Keep netloc and path components for later use.
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # Authorization required / forbidden: be conservative and
            # treat the whole site as off limits.
            self.disallow_all = True
        elif self.errcode >= 400 and self.errcode < 500:
            # Any other client error means "no robots.txt here": allow all.
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        # Entries naming "*" become the fallback default entry; all
        # other entries are matched in order before the default.
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
        We allow that a user-agent: line is not preceded by
        one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0      # input line counter; incremented but not otherwise used
        entry = Entry()     # record currently being accumulated

        self.modified()
        for line in lines:
            linenumber += 1
            if not line:
                # A blank line terminates the current record.
                if state == 1:
                    # user-agent line(s) with no rules: discard them.
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        # New record starts without an intervening blank
                        # line: flush the previous entry first.
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    # Rules before any user-agent line (state 0) are ignored.
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # Flush the final entry when the file did not end with a blank line.
            self._add_entry(entry)


    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True

        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False

        # search for given user agent matches
        # the first match counts
        # Normalize the URL: drop scheme and netloc, keep the re-quoted
        # path/params/query/fragment so it is comparable to rule paths.
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True


    def __str__(self):
        # NOTE(review): only the named entries are rendered; the default
        # "*" entry (self.default_entry) is omitted from the output.
        return ''.join([str(entry) + "\n" for entry in self.entries])
Skip Montanaro663f6c22001-01-20 15:59:25 +0000164
165
class RuleLine:
    """One "Allow:" (allowance==True) or "Disallow:" (allowance==False)
    directive paired with a quoted URL path prefix."""

    def __init__(self, path, allowance):
        # Per the exclusion protocol, an empty "Disallow:" value means
        # everything is allowed.
        if not allowance and path == '':
            allowance = True
        # Round-trip through urlparse to normalize the path, then quote it.
        normalized = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(normalized)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return whether this rule's path prefix matches filename."""
        if self.path == "*":
            return True
        return filename.startswith(self.path)

    def __str__(self):
        verb = "Allow" if self.allowance else "Disallow"
        return verb + ": " + self.path
Skip Montanaro663f6c22001-01-20 15:59:25 +0000182
183
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []    # agent name tokens this entry covers
        self.rulelines = []     # RuleLine objects, in file order

    def __str__(self):
        # Render one "User-agent:" line per agent, then each rule line.
        pieces = ["User-agent: %s\n" % agent for agent in self.useragents]
        pieces.extend("%s\n" % rule for rule in self.rulelines)
        return ''.join(pieces)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # Compare only the name token (before any "/version"), case-insensitively.
        name = useragent.split("/")[0].lower()
        for agent in self.useragents:
            # "*" is the catch-all agent; otherwise substring match.
            if agent == '*' or agent.lower() in name:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        # The first matching rule decides; no match means allowed.
        for rule in self.rulelines:
            if rule.applies_to(filename):
                return rule.allowance
        return True
Skip Montanaro663f6c22001-01-20 15:59:25 +0000219
class URLopener(urllib.FancyURLopener):
    """Opener that records the HTTP status code of the last request and
    never prompts for credentials."""

    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        # Assume success; overwritten by http_error_default() on any error.
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Record the status so RobotFileParser.read() can inspect it,
        # then defer to the base class's default error handling.
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)