#! /usr/bin/env python

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.
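
For example (the root and page here are purely illustrative; this is
the prefix test the checker applies):

    root = "http://www.python.org/doc/"
    url = "http://www.python.org/doc/tut/tut.html"
    url[:len(root)] == root     # true, so this url belongs to the subweb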

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
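
For example, to check such a local tree you might run (using the
default document root mentioned above; adjust the path for your server):

    webchecker.py file:/usr/local/etc/httpd/htdocs/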

Reports printed:

When done, it reports links to pages outside the web (unless -q is
specified), and pages with bad links within the subweb. When
interrupted, it prints those same reports for the pages that it has
checked already.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.
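
The checkpoint is simply the pickled Checker instance, so it can also
be inspected by hand; a minimal sketch, assuming the default checkpoint
filename and that this script is importable as the webchecker module
(the classes must be available for unpickling):

    from webchecker import *            # makes Checker et al. available
    import pickle
    f = open("@webchecker.pickle", "rb")
    c = pickle.load(f)
    f.close()
    c.report()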

Miscellaneous:

- Because the HTML parser is a bit slow, very large HTML files are
skipped. The size limit can be set with the -m option.

- Before fetching a page, it guesses its type based on its extension.
If the extension is known and the type is not text/html, the page is
not fetched. This is a huge optimization but occasionally it means
links can be missed. The mimetypes.py module (also in this directory)
has a built-in table mapping most currently known suffixes, and in
addition attempts to read the mime.types configuration files in the
default locations of Netscape and the NCSA HTTP daemon. (A minimal
sketch of this guess appears after this list.)

- It only follows links indicated by <A> tags. It doesn't follow
links in <FORM> or <IMG> or whatever other tags might contain
hyperlinks. It does honor the <BASE> tag.

- It could be argued that it should also check external links for
validity. This is true, but it is more error-prone. I think I will
make this an option in the future.
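
A minimal sketch of the extension-based type guess mentioned in the
list above (the URL here is hypothetical; mimetypes.guess_type()
returns a (type, encoding) pair):

    import mimetypes
    ctype, encoding = mimetypes.guess_type("http://somehost/pics/logo.gif")
    # ctype == 'image/gif' here, so such a page would not be fetched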


Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-v        -- verbose operation; repeating -v will increase verbosity

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

"""


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import htmllib
import formatter

import mimetypes


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
MAXPAGE = 50000                                 # Ignore files bigger than this
ROUNDSIZE = 50                                  # Number of links processed per round
DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint


# Global variables
verbose = 1
maxpage = MAXPAGE
roundsize = ROUNDSIZE


def main():
    global verbose, maxpage, roundsize
    dumpfile = DUMPFILE
    restart = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__ % globals()
        sys.exit(2)
    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-v':
            verbose = verbose + 1

    if restart:
        if verbose > 0:
            print "Loading checkpoint from %s ..." % dumpfile
        f = open(dumpfile, "rb")
        c = pickle.load(f)
        f.close()
        if verbose > 0:
            print "Done."
            print "Root:", string.join(c.roots, "\n      ")
    else:
        c = Checker()
        if not args:
            args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    if not c.todo:
        needsave = 0
    else:
        needsave = 1
    try:
        c.run()
    except KeyboardInterrupt:
        if verbose > 0:
            print "[interrupted]"
    c.report()
    if not needsave:
        if verbose > 0:
            print
            print "No need to save checkpoint"
    elif dumpfile:
        if verbose > 0:
            print
            print "Saving checkpoint to %s ..." % dumpfile
        newfile = dumpfile + ".new"
        f = open(newfile, "wb")
        pickle.dump(c, f)
        f.flush()
        f.close()
        try:
            os.unlink(dumpfile)
        except os.error:
            pass
        os.rename(newfile, dumpfile)
        if verbose > 0:
            print "Done."
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


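# The Checker object holds all state. todo and done map a URL inside the
# subweb to the list of (origin page, raw link) pairs that refer to it,
# ext does the same for URLs outside the subweb, and bad maps a failing
# URL to the error message from the failed fetch.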
class Checker:

    def __init__(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.ext = {}
        self.bad = {}
        self.urlopener = MyURLopener()
        self.round = 0

    def addroot(self, root):
        if root not in self.roots:
            self.roots.append(root)
            self.todo[root] = []

    def run(self):
        while self.todo:
            self.round = self.round + 1
            if verbose > 0:
                print
                print "Round", self.round,
                print "(%d to do, %d done, %d external, %d bad)" % (
                    len(self.todo), len(self.done),
                    len(self.ext), len(self.bad))
                print
            urls = self.todo.keys()[:roundsize]
            for url in urls:
                self.dopage(url)
                self.done[url] = self.todo[url]
                del self.todo[url]

    def report(self):
        print
        if not self.todo: print "Final",
        else: print "Interim",
        print "Report (%d to do, %d done, %d external, %d bad)" % (
            len(self.todo), len(self.done),
            len(self.ext), len(self.bad))
        if verbose > 0:
            self.report_extrefs()
        # Report errors last because the output may get truncated
        self.report_errors()

    def report_extrefs(self):
        if not self.ext:
            print
            print "No external URLs"
            return
        print
        print "External URLs:"
        print
        urls = self.ext.keys()
        urls.sort()
        for url in urls:
            show("HREF ", url, " from", self.ext[url])

    def report_errors(self):
        if not self.bad:
            print
            print "No errors"
            return
        print
        print "Error Report:"
        urls = self.bad.keys()
        urls.sort()
        bysource = {}
        for url in urls:
            try:
                origins = self.done[url]
            except KeyError:
                origins = self.todo[url]
            for source, rawlink in origins:
                triple = url, rawlink, self.bad[url]
                try:
                    bysource[source].append(triple)
                except KeyError:
                    bysource[source] = [triple]
        sources = bysource.keys()
        sources.sort()
        for source in sources:
            triples = bysource[source]
            print
            if len(triples) > 1:
                print len(triples), "Errors in", source
            else:
                print "Error in", source
            for url, rawlink, msg in triples:
                print "  HREF", url,
                if rawlink != url: print "(%s)" % rawlink,
                print
                print "   msg", msg

    def dopage(self, url):
        if verbose > 1:
            if verbose > 2:
                show("Page ", url, " from", self.todo[url])
            else:
                print "Page ", url
        page = self.getpage(url)
        if not page:
            return
        for info in page.getlinkinfos():
            link, rawlink = info
            origin = url, rawlink
            if not self.inroots(link):
                try:
                    self.ext[link].append(origin)
                    if verbose > 3:
                        print " Seen ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                except KeyError:
                    if verbose > 3:
                        print " New ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                    self.ext[link] = [origin]
            elif self.done.has_key(link):
                if verbose > 3:
                    print " Done link", link
                self.done[link].append(origin)
            elif self.todo.has_key(link):
                if verbose > 3:
                    print " Seen todo link", link
                self.todo[link].append(origin)
            else:
                if verbose > 3:
                    print " New todo link", link
                self.todo[link] = [origin]

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return 1
        return 0

    def getpage(self, url):
        ctype, encoding = mimetypes.guess_type(url)
        if encoding:
            if verbose > 2:
                print " Won't bother, URL suggests encoding %s" % `encoding`
            return None
        if ctype and ctype != 'text/html':
            if verbose > 2:
                print " Won't bother, URL suggests mime type %s" % `ctype`
            return None
        try:
            f = self.urlopener.open(url)
        except IOError, msg:
            if verbose > 0:
                print "Error ", msg
                show(" HREF ", url, " from", self.todo[url])
            self.bad[url] = msg
            return None
        nurl = f.geturl()
        info = f.info()
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        if nurl != url:
            if verbose > 1:
                print "Redirected to", nurl
            if not ctype:
                ctype, encoding = mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            f.close()
            if verbose > 2:
                print " Not HTML, mime type", ctype
            return None
        text = f.read()
        f.close()
        return Page(text, nurl)


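# A fetched HTML page. getlinkinfos() parses the text and returns a list
# of (absolute link, raw link as written) pairs, with fragments stripped
# and relative links resolved against the page's <BASE> tag or URL.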
class Page:

    def __init__(self, text, url):
        self.text = text
        self.url = url

    def getlinkinfos(self):
        size = len(self.text)
        if size > maxpage:
            if verbose > 0:
                print "Skip huge file", self.url
                print " (%.0f Kbytes)" % (size*0.001)
            return []
        if verbose > 2:
            print " Parsing", self.url, "(%d bytes)" % size
        parser = MyHTMLParser(formatter.NullFormatter())
        parser.feed(self.text)
        parser.close()
        rawlinks = parser.getlinks()
        base = urlparse.urljoin(self.url, parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink))
        return infos


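# StringIO subclass that also carries the URL and header info that the
# rest of the code expects from the file-like object returned by the
# URL opener (geturl() and info()).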
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


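# URL opener that extends the handling of file: URLs: for a directory it
# serves the index.html file if one exists, and otherwise synthesizes an
# HTML listing (with a <BASE> tag) so a local document tree can be walked
# just like a tree served by an HTTP daemon.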
class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        # Not a directory -- let the standard file: handling take over
        return urllib.FancyURLopener.open_file(self, url)


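# HTML parser that collects the targets of HREF attributes in <A> tags
# and remembers the <BASE> tag, if any; all other markup is ignored.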
class MyHTMLParser(htmllib.HTMLParser):

    def __init__(*args):
        self = args[0]
        self.base = None
        self.links = []
        apply(htmllib.HTMLParser.__init__, args)

    def start_a(self, attributes):
        for name, value in attributes:
            if name == 'href' and value and value not in self.links:
                self.links.append(string.strip(value))

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href' and value:
                if verbose > 1:
                    print " Base", value
                self.base = value

    def getlinks(self):
        return self.links

    def getbase(self):
        return self.base


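# Print a link followed by the list of (page, raw link) origins that
# refer to it, blanking the label after the first origin so that
# continuation lines stay aligned.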
def show(p1, link, p2, origins):
    print p1, link
    i = 0
    for source, rawlink in origins:
        i = i+1
        if i == 2:
            p2 = ' '*len(p2)
        print p2, source,
        if rawlink != link: print "(%s)" % rawlink,
        print


if __name__ == '__main__':
    main()