\section{\module{robotparser} ---
         Parser for robots.txt}

\declaremodule{standard}{robotparser}
\modulesynopsis{Loads a \protect\file{robots.txt} file and
                answers questions about fetchability of other URLs.}
\sectionauthor{Skip Montanaro}{skip@mojam.com}

\index{WWW}
\index{World Wide Web}
\index{URL}
\index{robots.txt}

This module provides a single class, \class{RobotFileParser}, which
answers questions about whether or not a particular user agent can fetch a
URL on the Web site that published the \file{robots.txt} file.  For more
details on the structure of \file{robots.txt} files, see
\url{http://www.robotstxt.org/wc/norobots.html}.

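For illustration, here is a minimal (invented) \file{robots.txt} file.  It
forbids all robots from the \file{/cgi-bin/} area of the site and places
no other restrictions:

\begin{verbatim}
User-agent: *
Disallow: /cgi-bin/
\end{verbatim}
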
\begin{classdesc}{RobotFileParser}{}

This class provides a set of methods to read, parse and answer questions
about a single \file{robots.txt} file.

\begin{methoddesc}{set_url}{url}
Sets the URL referring to a \file{robots.txt} file.
\end{methoddesc}

\begin{methoddesc}{read}{}
Reads the \file{robots.txt} URL and feeds it to the parser.
\end{methoddesc}

\begin{methoddesc}{parse}{lines}
Parses the given list of lines from a \file{robots.txt} file.  The lines
may come from any source; see the sketch following this class description.
\end{methoddesc}

\begin{methoddesc}{can_fetch}{useragent, url}
Returns \code{True} if the \var{useragent} is allowed to fetch the
\var{url} according to the rules contained in the parsed \file{robots.txt}
file.
\end{methoddesc}

\begin{methoddesc}{mtime}{}
Returns the time the \file{robots.txt} file was last fetched.  This is
useful for long-running web spiders that need to check for new
\file{robots.txt} files periodically; the final example in this section
sketches such a check.
\end{methoddesc}

\begin{methoddesc}{modified}{}
Sets the time the \file{robots.txt} file was last fetched to the current
time.
\end{methoddesc}

\end{classdesc}

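If the \file{robots.txt} text is already at hand (cached on disk, say),
the \method{parse()} method can be fed the lines directly instead of
fetching them with \method{read()}.  A minimal sketch, assuming a local
copy saved under the hypothetical name \file{robots.txt}:

\begin{verbatim}
import robotparser

rp = robotparser.RobotFileParser()
# Feed the parser lines from a local copy rather than fetching over HTTP.
lines = open("robots.txt").readlines()
rp.parse(lines)
\end{verbatim}
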
The following example demonstrates basic use of the
\class{RobotFileParser} class.

\begin{verbatim}
>>> import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.set_url("http://www.musi-cal.com/robots.txt")
>>> rp.read()
>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
False
>>> rp.can_fetch("*", "http://www.musi-cal.com/")
True
\end{verbatim}
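
A long-running spider can combine \method{mtime()} and \method{modified()}
to decide when its cached rules have gone stale.  A sketch, where the
one-hour refresh interval and the host are invented for illustration:

\begin{verbatim}
import time
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")
rp.read()

# ... later, inside the crawl loop ...
if time.time() - rp.mtime() > 3600:
    rp.read()        # fetch a fresh copy of the rules
    rp.modified()    # record the time of this fetch
\end{verbatim}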