\section{\module{robotparser} ---
         Parser for robots.txt}

\declaremodule{standard}{robotparser}
\modulesynopsis{Loads a \protect\file{robots.txt} file and
                answers questions about fetchability of other URLs.}
\sectionauthor{Skip Montanaro}{skip@mojam.com}

\index{WWW}
\index{World Wide Web}
\index{URL}
\index{robots.txt}

This module provides a single class, \class{RobotFileParser}, which
answers questions about whether or not a particular user agent can
fetch a URL on the Web site that published the \file{robots.txt} file.
For more details on the structure of \file{robots.txt} files, see
\url{http://info.webcrawler.com/mak/projects/robots/norobots.html}.

\begin{classdesc}{RobotFileParser}{}

This class provides a set of methods to read, parse, and answer
questions about a single \file{robots.txt} file.

\begin{methoddesc}{set_url}{url}
Sets the URL referring to a \file{robots.txt} file.
\end{methoddesc}

\begin{methoddesc}{read}{}
Reads the \file{robots.txt} URL and feeds it to the parser.
\end{methoddesc}

\begin{methoddesc}{parse}{lines}
Parses \var{lines}, which should be a list of lines from a
\file{robots.txt} file, and adds the rules found there to the parser.
\end{methoddesc}
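
As a minimal sketch (assuming the file has already been retrieved by
some other means, here with \module{urllib}), a list of lines can be
handed to \method{parse()} directly instead of calling \method{read()}:

\begin{verbatim}
>>> import robotparser, urllib
>>> rp = robotparser.RobotFileParser()
>>> lines = urllib.urlopen("http://www.musi-cal.com/robots.txt").readlines()
>>> rp.parse(lines)
>>> rp.can_fetch("*", "http://www.musi-cal.com/")
1
\end{verbatim}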

\begin{methoddesc}{can_fetch}{useragent, url}
Returns true if the \var{useragent} is allowed to fetch the \var{url}
according to the rules contained in the parsed \file{robots.txt} file.
\end{methoddesc}

\begin{methoddesc}{mtime}{}
Returns the time the \file{robots.txt} file was last fetched. This is
useful for long-running web spiders that need to check for new
\file{robots.txt} files periodically.
\end{methoddesc}

\begin{methoddesc}{modified}{}
Sets the time the \file{robots.txt} file was last fetched to the
current time.
\end{methoddesc}
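
Taken together, \method{mtime()} and \method{modified()} support a
simple refresh policy. The following sketch re-reads the file whenever
the cached copy grows stale; the one-day maximum age and the
\code{allowed()} helper are illustrative choices, not part of the
module:

\begin{verbatim}
import time
import robotparser

MAX_AGE = 24 * 60 * 60          # arbitrary example: refresh once a day

rp = robotparser.RobotFileParser()
rp.set_url("http://www.musi-cal.com/robots.txt")

def allowed(useragent, url):
    # Re-fetch the robots.txt file if the cached copy is too old;
    # mtime() returns 0 before the first fetch, forcing a read().
    if time.time() - rp.mtime() > MAX_AGE:
        rp.read()
        rp.modified()           # record the time of this fetch
    return rp.can_fetch(useragent, url)
\end{verbatim}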

\end{classdesc}

The following example demonstrates basic use of the
\class{RobotFileParser} class.

\begin{verbatim}
>>> import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.set_url("http://www.musi-cal.com/robots.txt")
>>> rp.read()
>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
0
>>> rp.can_fetch("*", "http://www.musi-cal.com/")
1
\end{verbatim}