| \section{\module{robotparser} --- |
| Parser for robots.txt} |
| |
| \declaremodule{standard}{robotparser} |
| \modulesynopsis{Loads a \protect\file{robots.txt} file and |
| answers questions about fetchability of other URLs.} |
| \sectionauthor{Skip Montanaro}{skip@mojam.com} |
| |
| \index{WWW} |
| \index{World Wide Web} |
| \index{URL} |
| \index{robots.txt} |
| |
This module provides a single class, \class{RobotFileParser}, which answers
questions about whether a particular user agent can fetch a URL on
the Web site that published the \file{robots.txt} file. For more details on
| the structure of \file{robots.txt} files, see |
| \url{http://info.webcrawler.com/mak/projects/robots/norobots.html}. |
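For illustration, a minimal \file{robots.txt} file might look like the
following (a hypothetical example, not taken from any particular site); it
tells every robot to stay out of the \file{/cgi-bin/} area while leaving the
rest of the site fetchable:

\begin{verbatim}
# Hypothetical robots.txt; '*' matches any user agent.
User-agent: *
Disallow: /cgi-bin/
\end{verbatim}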
| |
| \begin{classdesc}{RobotFileParser}{} |
| |
| This class provides a set of methods to read, parse and answer questions |
| about a single \file{robots.txt} file. |
| |
| \begin{methoddesc}{set_url}{url} |
| Sets the URL referring to a \file{robots.txt} file. |
| \end{methoddesc} |
| |
| \begin{methoddesc}{read}{} |
| Reads the \file{robots.txt} URL and feeds it to the parser. |
| \end{methoddesc} |
| |
| \begin{methoddesc}{parse}{lines} |
Parses the \var{lines} argument, which should be a list of lines from a
\file{robots.txt} file.
| \end{methoddesc} |
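The \method{parse()} method is useful when the file has been retrieved by
some means other than \method{read()}. The following is a minimal sketch,
assuming a local copy of the file (the path is hypothetical):

\begin{verbatim}
import robotparser

rp = robotparser.RobotFileParser()
# Feed the parser lines obtained without calling read(), for
# example from a local copy of the file.
f = open("robots.txt")
rp.parse(f.readlines())
f.close()
\end{verbatim}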
| |
| \begin{methoddesc}{can_fetch}{useragent, url} |
| Returns true if the \var{useragent} is allowed to fetch the \var{url} |
| according to the rules contained in the parsed \file{robots.txt} file. |
| \end{methoddesc} |
| |
| \begin{methoddesc}{mtime}{} |
Returns the time the \file{robots.txt} file was last fetched. This is
useful for long-running web spiders that need to check for new
\file{robots.txt} files periodically.
| \end{methoddesc} |
| |
| \begin{methoddesc}{modified}{} |
Sets the time the \file{robots.txt} file was last fetched to the current
| time. |
| \end{methoddesc} |
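Taken together, \method{mtime()} and \method{modified()} let a long-running
spider decide when its cached rules have grown stale. The following is a
minimal sketch, assuming a one-hour refresh interval (the interval and the
\code{allowed()} helper are hypothetical, not part of the module):

\begin{verbatim}
import time
import robotparser

REFRESH_SECONDS = 3600          # hypothetical refresh interval

rp = robotparser.RobotFileParser()
rp.set_url("http://www.musi-cal.com/robots.txt")
rp.read()
rp.modified()                   # record when the file was fetched

def allowed(url):
    # Re-fetch robots.txt if the cached copy is too old.
    if time.time() - rp.mtime() > REFRESH_SECONDS:
        rp.read()
        rp.modified()
    return rp.can_fetch("*", url)
\end{verbatim}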
| |
| \end{classdesc} |
| |
The following example demonstrates basic use of the \class{RobotFileParser}
class.
| |
| \begin{verbatim} |
| >>> import robotparser |
| >>> rp = robotparser.RobotFileParser() |
| >>> rp.set_url("http://www.musi-cal.com/robots.txt") |
| >>> rp.read() |
| >>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco") |
| 0 |
| >>> rp.can_fetch("*", "http://www.musi-cal.com/") |
| 1 |
| \end{verbatim} |