#! /usr/bin/env python

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.
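
For example (an illustrative sketch, with made-up URLs), the membership
test is a plain string-prefix comparison, essentially what the
Checker.inroots() method below performs:

    root = "http://www.python.org/doc/"
    url = "http://www.python.org/doc/lib/index.html"
    if url[:len(root)] == root:
        pass    # url belongs to the subweb rooted at root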

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Reports printed:

When done, it reports links to pages outside the web (unless -q is
specified), and pages with bad links within the subweb. When
interrupted, it prints those same reports for the pages that it has
checked already.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.
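
A typical checkpointed session might look like this (the root URL here
is only an example):

    webchecker.py http://www.python.org/    # first run; interrupt with ^C
    webchecker.py -R                         # resume from %(DUMPFILE)s
    webchecker.py -Rq                        # -Rq prints the errors only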

Miscellaneous:

- Because the HTML parser is a bit slow, very large HTML files are
  skipped. The size limit can be set with the -m option.

- Before fetching a page, it guesses its type based on its extension.
If it is a known extension and the type is not text/html, the page is
not fetched. This is a huge optimization but occasionally it means
links can be missed. The mimetypes.py module (also in this directory)
has a built-in table mapping most currently known suffixes, and in
addition attempts to read the mime.types configuration files in the
default locations of Netscape and the NCSA HTTP daemon.

- It only follows links indicated by <A> tags. It doesn't follow
links in <FORM> or <IMG> or whatever other tags might contain
hyperlinks. It does honor the <BASE> tag.

- It could be argued that it should also check external links for
validity. This is true, but it is more error-prone. I think I will
make this an option in the future.


Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-v        -- verbose operation; repeating -v will increase verbosity

Arguments:

rooturl -- URL to start checking
           (default %(DEFROOT)s)

"""


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import htmllib
import formatter

import mimetypes


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
MAXPAGE = 50000                                 # Ignore files bigger than this
ROUNDSIZE = 50                                  # Number of links processed per round
DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint


# Global variables
verbose = 1
maxpage = MAXPAGE
roundsize = ROUNDSIZE


def main():
    global verbose, maxpage, roundsize
    dumpfile = DUMPFILE
    restart = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__ % globals()
        sys.exit(2)
    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-v':
            verbose = verbose + 1

    if restart:
        if verbose > 0:
            print "Loading checkpoint from %s ..." % dumpfile
        f = open(dumpfile, "rb")
        c = pickle.load(f)
        f.close()
        if verbose > 0:
            print "Done."
            print "Root:", string.join(c.roots, "\n ")
    else:
        c = Checker()
        if not args:
            args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    if not c.todo:
        needsave = 0
    else:
        needsave = 1
    try:
        c.run()
    except KeyboardInterrupt:
        if verbose > 0:
            print "[interrupted]"
    c.report()
    if not needsave:
        if verbose > 0:
            print
            print "No need to save checkpoint"
    elif dumpfile:
        if verbose > 0:
            print
            print "Saving checkpoint to %s ..." % dumpfile
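        # Dump to a temporary ".new" file and rename it into place afterwards,
        # so an interrupted dump never clobbers the previous checkpoint.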
        newfile = dumpfile + ".new"
        f = open(newfile, "wb")
        pickle.dump(c, f)
        f.flush()
        f.close()
        try:
            os.unlink(dumpfile)
        except os.error:
            pass
        os.rename(newfile, dumpfile)
        if verbose > 0:
            print "Done."
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


class Checker:

    def __init__(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.ext = {}
        self.bad = {}
        self.urlopener = MyURLopener()
        self.round = 0

    def addroot(self, root):
        if root not in self.roots:
            self.roots.append(root)
            self.todo[root] = []

    def run(self):
        while self.todo:
            self.round = self.round + 1
            if verbose > 0:
                print
                print "Round", self.round,
                print "(%d to do, %d done, %d external, %d bad)" % (
                    len(self.todo), len(self.done),
                    len(self.ext), len(self.bad))
                print
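            # Process at most roundsize URLs in this round; links discovered
            # along the way are added to self.todo by dopage() and picked up
            # in a later round.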
            urls = self.todo.keys()[:roundsize]
            for url in urls:
                self.dopage(url)
                self.done[url] = self.todo[url]
                del self.todo[url]

    def report(self):
        print
        if not self.todo: print "Final",
        else: print "Interim",
        print "Report (%d to do, %d done, %d external, %d bad)" % (
            len(self.todo), len(self.done),
            len(self.ext), len(self.bad))
        if verbose > 0:
            self.report_extrefs()
        # Report errors last because the output may get truncated
        self.report_errors()

    def report_extrefs(self):
        if not self.ext:
            print
            print "No external URLs"
            return
        print
        print "External URLs:"
        print
        urls = self.ext.keys()
        urls.sort()
        for url in urls:
            show("HREF ", url, " from", self.ext[url])

    def report_errors(self):
        if not self.bad:
            print
            print "No errors"
            return
        print
        print "Error Report:"
        urls = self.bad.keys()
        urls.sort()
        bysource = {}
        for url in urls:
            try:
                origins = self.done[url]
            except KeyError:
                origins = self.todo[url]
            for source, rawlink in origins:
                triple = url, rawlink, self.bad[url]
                try:
                    bysource[source].append(triple)
                except KeyError:
                    bysource[source] = [triple]
        sources = bysource.keys()
        sources.sort()
        for source in sources:
            triples = bysource[source]
            print
            if len(triples) > 1:
                print len(triples), "Errors in", source
            else:
                print "Error in", source
            for url, rawlink, msg in triples:
                print " HREF", url,
                if rawlink != url: print "(%s)" % rawlink,
                print
                print " msg", msg

    def dopage(self, url):
        if verbose > 1:
            if verbose > 2:
                show("Page ", url, " from", self.todo[url])
            else:
                print "Page ", url
        page = self.getpage(url)
        if not page:
            return
        for info in page.getlinkinfos():
            link, rawlink = info
            origin = url, rawlink
            if not self.inroots(link):
                try:
                    self.ext[link].append(origin)
                    if verbose > 3:
                        print " Seen ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                except KeyError:
                    if verbose > 3:
                        print " New ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                    self.ext[link] = [origin]
            elif self.done.has_key(link):
                if verbose > 3:
                    print " Done link", link
                self.done[link].append(origin)
            elif self.todo.has_key(link):
                if verbose > 3:
                    print " Seen todo link", link
                self.todo[link].append(origin)
            else:
                if verbose > 3:
                    print " New todo link", link
                self.todo[link] = [origin]

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return 1
        return 0

    def getpage(self, url):
        ctype, encoding = mimetypes.guess_type(url)
        if encoding:
            if verbose > 2:
                print " Won't bother, URL suggests encoding %s" % `encoding`
            return None
        if ctype and ctype != 'text/html':
            if verbose > 2:
                print " Won't bother, URL suggests mime type %s" % `ctype`
            return None
        try:
            f = self.urlopener.open(url)
        except IOError, msg:
            if verbose > 0:
                print "Error ", msg
                if verbose > 0:
                    show(" HREF ", url, " from", self.todo[url])
            self.bad[url] = msg
            return None
        nurl = f.geturl()
        info = f.info()
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        if nurl != url:
            if verbose > 1:
                print "Redirected to", nurl
            if not ctype:
                ctype, encoding = mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            f.close()
            if verbose > 2:
                print " Not HTML, mime type", ctype
            return None
        text = f.read()
        f.close()
        return Page(text, nurl)

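# A hedged sketch (not part of the program) of driving the Checker class
# above directly, instead of through main()'s command-line interface; the
# root URL is made up:
#
#     c = Checker()
#     c.addroot("http://www.python.org/doc/")
#     try:
#         c.run()
#     except KeyboardInterrupt:
#         pass
#     c.report()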

class Page:

    def __init__(self, text, url):
        self.text = text
        self.url = url

    def getlinkinfos(self):
        size = len(self.text)
        if size > maxpage:
            if verbose > 0:
                print "Skip huge file", self.url
                print " (%.0f Kbytes)" % (size*0.001)
            return []
        if verbose > 2:
            print " Parsing", self.url, "(%d bytes)" % size
        parser = MyHTMLParser(formatter.NullFormatter())
        parser.feed(self.text)
        parser.close()
        rawlinks = parser.getlinks()
        base = urlparse.urljoin(self.url, parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
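            # Strip any fragment (the part after '#') so that, e.g.,
            # "page.html" and "page.html#section" count as the same link;
            # urlparse() returns a 6-tuple whose last element is the fragment.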
            t = urlparse.urlparse(rawlink)
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink))
        return infos


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

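    # Use the plain URLopener treatment of HTTP errors: raise IOError
    # instead of returning the server's error page as if it were data,
    # so that bad links are recorded in Checker.bad by getpage().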
    http_error_default = urllib.URLopener.http_error_default

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if path[-1] != os.sep:
            url = url + '/'
        if os.path.isdir(path):
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
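            # No index.html here: synthesize a minimal HTML listing, the way
            # an HTTP daemon would -- a <BASE> tag so relative links resolve
            # inside this directory, plus one <A> element per entry.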
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, path)


class MyHTMLParser(htmllib.HTMLParser):

    def __init__(*args):
        self = args[0]
        self.base = None
        self.links = []
        apply(htmllib.HTMLParser.__init__, args)

    def start_a(self, attributes):
        for name, value in attributes:
            if name == 'href' and value:
                value = string.strip(value)
                if value not in self.links:
                    self.links.append(value)

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href' and value:
                if verbose > 1:
                    print " Base", value
                self.base = value

    def getlinks(self):
        return self.links

    def getbase(self):
        return self.base


def show(p1, link, p2, origins):
    print p1, link
    i = 0
    for source, rawlink in origins:
        i = i+1
        if i == 2:
            p2 = ' '*len(p2)
        print p2, source,
        if rawlink != link: print "(%s)" % rawlink,
        print


if __name__ == '__main__':
    main()