#! /usr/bin/env python

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
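
For example (the subdirectory shown here is hypothetical), with the
default root a request for file:/usr/local/etc/httpd/htdocs/manual/
returns /usr/local/etc/httpd/htdocs/manual/index.html if that file
exists, and an HTML listing of the directory's contents otherwise.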

Reports printed:

When done, it reports links to pages outside the web (unless -q is
specified), and pages with bad links within the subweb. When
interrupted, it prints those same reports for the pages that it has
checked already.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.
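
For example (the URL and filename here are only illustrative):

    webchecker.py -d mysite.pickle http://www.mysite.example/
    webchecker.py -R -d mysite.pickle -q

The first command checks that subweb and leaves its state in
mysite.pickle; assuming the first run completed, the second command
re-reads that checkpoint and prints just the error report.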

Miscellaneous:

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the HTML parser is a bit slow, very large HTML files are
skipped. The size limit can be set with the -m option.

- Before fetching a page, it guesses its type based on its extension.
If it is a known extension and the type is not text/html, the page is
not fetched. This is a huge optimization but occasionally it means
links can be missed. The mimetypes.py module (also in this directory)
has a built-in table mapping most currently known suffixes, and in
addition attempts to read the mime.types configuration files in the
default locations of Netscape and the NCSA HTTP daemon.

- It only follows links indicated by <A> tags. It doesn't follow
links in <FORM> or <IMG> or whatever other tags might contain
hyperlinks. It does honor the <BASE> tag.

- Checking external links is not done by default; use -x to enable
this feature. This is done because checking external links usually
takes a lot of time. When enabled, this check is executed during the
report generation phase (so -x is ignored when -q is specified). Even
when -x is enabled, only ``http:'' URLs are checked.


Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- check external links (during report phase)

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

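Example (the root URL below is just an illustration):

    webchecker.py -v -x http://www.mydomain.example/

This checks that subweb verbosely and, during the report phase, also
checks the external links it found.
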
"""

__version__ = "0.1"


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import htmllib
import formatter

import mimetypes
import robotparser


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
MAXPAGE = 50000                                 # Ignore files bigger than this
ROUNDSIZE = 50                                  # Number of links processed per round
DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint
AGENTNAME = "webchecker"                        # Agent name for robots.txt parser


# Global variables
verbose = 1
maxpage = MAXPAGE
roundsize = ROUNDSIZE


def main():
    global verbose, maxpage, roundsize
    dumpfile = DUMPFILE
    restart = 0
    checkext = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:vx')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__ % globals()
        sys.exit(2)
    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = 1

    if verbose:
        print AGENTNAME, "version", __version__

    if restart:
        if verbose > 0:
            print "Loading checkpoint from %s ..." % dumpfile
        f = open(dumpfile, "rb")
        c = pickle.load(f)
        f.close()
        if verbose > 0:
            print "Done."
            print "Root:", string.join(c.roots, "\n ")
    else:
        c = Checker()
        if not args:
            args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    if not c.todo:
        needsave = 0
    else:
        needsave = 1
        try:
            c.run()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[run interrupted]"
    try:
        c.report(checkext)
    except KeyboardInterrupt:
        if verbose > 0:
            print "[report interrupted]"
    if not needsave:
        if verbose > 0:
            print
            print "No need to save checkpoint"
    elif dumpfile:
        if verbose > 0:
            print
            print "Saving checkpoint to %s ..." % dumpfile
        newfile = dumpfile + ".new"
        f = open(newfile, "wb")
        pickle.dump(c, f)
        f.flush()
        f.close()
        try:
            os.unlink(dumpfile)
        except os.error:
            pass
        os.rename(newfile, dumpfile)
        if verbose > 0:
            print "Done."
        if dumpfile == DUMPFILE:
            print "Use ``%s -R'' to restart." % sys.argv[0]
        else:
            print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                       dumpfile)


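# A Checker instance holds all the state of one checking run: the root
# URLs, the todo/done/ext dictionaries (each mapping a URL to the list
# of (page URL, raw link) pairs that refer to it), the bad dictionary
# (mapping a URL to the error it produced), a robots.txt parser per
# root, and the URL opener used to fetch pages.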
class Checker:

    def __init__(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.ext = {}
        self.bad = {}
        self.urlopener = MyURLopener()
        self.round = 0
        self.robots = {}

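    # Only the plain crawl state is checkpointed; the robots.txt
    # parsers and the URL opener are not pickled, and __setstate__
    # re-creates the robot parsers from the restored roots.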
    def __getstate__(self):
        return (self.roots, self.todo, self.done,
                self.ext, self.bad, self.round)

    def __setstate__(self, state):
        (self.roots, self.todo, self.done,
         self.ext, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)

    def addroot(self, root):
        if root not in self.roots:
            self.roots.append(root)
            self.todo[root] = []
            self.addrobot(root)

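    # Fetch and parse the robots.txt file for a root URL; I/O errors
    # are reported in sufficiently verbose mode and otherwise ignored.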
    def addrobot(self, root):
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        if verbose > 2:
            print "Parsing", url
            rp.debug = 1
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            if verbose > 1:
                print "I/O error parsing", url, ":", msg

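    # Process pending URLs in rounds of at most roundsize pages each;
    # every page that has been handled moves from self.todo to
    # self.done together with its list of origins.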
    def run(self):
        while self.todo:
            self.round = self.round + 1
            if verbose > 0:
                print
                print "Round", self.round,
                print "(%d to do, %d done, %d external, %d bad)" % (
                    len(self.todo), len(self.done),
                    len(self.ext), len(self.bad))
                print
            urls = self.todo.keys()[:roundsize]
            for url in urls:
                self.dopage(url)
                self.done[url] = self.todo[url]
                del self.todo[url]

    def report(self, checkext=0):
        print
        if not self.todo: print "Final",
        else: print "Interim",
        print "Report (%d to do, %d done, %d external, %d bad)" % (
            len(self.todo), len(self.done),
            len(self.ext), len(self.bad))
        if verbose > 0:
            self.report_extrefs(checkext)
        # Report errors last because the output may get truncated
        self.report_errors()

    def report_extrefs(self, checkext=0):
        if not self.ext:
            print
            print "No external URLs"
            return
        print
        if checkext:
            print "External URLs (checking validity):"
        else:
            print "External URLs (not checked):"
        print
        urls = self.ext.keys()
        urls.sort()
        for url in urls:
            show("HREF ", url, " from", self.ext[url])
            if not checkext:
                continue
            if verbose > 2: print "Checking", url, "..."
            try:
                f = self.urlopener.open(url)
                f.close()
                if verbose > 3: print "OK"
            except IOError, msg:
                print "Error:", msg

    def report_errors(self):
        if not self.bad:
            print
            print "No errors"
            return
        print
        print "Error Report:"
        urls = self.bad.keys()
        urls.sort()
        bysource = {}
        for url in urls:
            try:
                origins = self.done[url]
            except KeyError:
                origins = self.todo[url]
            for source, rawlink in origins:
                triple = url, rawlink, self.bad[url]
                try:
                    bysource[source].append(triple)
                except KeyError:
                    bysource[source] = [triple]
        sources = bysource.keys()
        sources.sort()
        for source in sources:
            triples = bysource[source]
            print
            if len(triples) > 1:
                print len(triples), "Errors in", source
            else:
                print "Error in", source
            for url, rawlink, msg in triples:
                print " HREF", url,
                if rawlink != url: print "(%s)" % rawlink,
                print
                print " msg", msg

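    # Check one page: fetch and parse it, then file every link found
    # on it as external, already done, already queued, or newly queued.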
    def dopage(self, url):
        if verbose > 1:
            if verbose > 2:
                show("Page ", url, " from", self.todo[url])
            else:
                print "Page ", url
        page = self.getpage(url)
        if not page:
            return
        for info in page.getlinkinfos():
            link, rawlink = info
            origin = url, rawlink
            if not self.inroots(link):
                try:
                    self.ext[link].append(origin)
                    if verbose > 3:
                        print " Seen ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                except KeyError:
                    if verbose > 3:
                        print " New ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                    self.ext[link] = [origin]
            elif self.done.has_key(link):
                if verbose > 3:
                    print " Done link", link
                self.done[link].append(origin)
            elif self.todo.has_key(link):
                if verbose > 3:
                    print " Seen todo link", link
                self.todo[link].append(origin)
            else:
                if verbose > 3:
                    print " New todo link", link
                self.todo[link] = [origin]

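    # A link belongs to the subweb if it textually extends one of the
    # root URLs and that root's robots.txt does not forbid fetching it.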
    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.robots[root].can_fetch(AGENTNAME, url)
        return 0

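    # Fetch one URL and wrap the result in a Page object; return None
    # when the extension suggests the document isn't HTML, when the
    # server says it isn't HTML, or when fetching fails (fetch errors
    # are recorded in self.bad).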
    def getpage(self, url):
        ctype, encoding = mimetypes.guess_type(url)
        if encoding:
            if verbose > 2:
                print " Won't bother, URL suggests encoding %s" % `encoding`
            return None
        if ctype and ctype != 'text/html':
            if verbose > 2:
                print " Won't bother, URL suggests mime type %s" % `ctype`
            return None
        try:
            f = self.urlopener.open(url)
        except IOError, msg:
            if (type(msg) == TupleType and
                len(msg) >= 4 and
                msg[0] == 'http error' and
                type(msg[3]) == InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
            if verbose > 0:
                print "Error ", msg
                show(" HREF ", url, " from", self.todo[url])
            self.bad[url] = msg
            return None
        nurl = f.geturl()
        info = f.info()
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        if nurl != url:
            if verbose > 1:
                print " Redirected to", nurl
        if not ctype:
            ctype, encoding = mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            f.close()
            if verbose > 2:
                print " Not HTML, mime type", ctype
            return None
        text = f.read()
        f.close()
        return Page(text, nurl)


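# A Page holds the raw HTML text of one fetched page together with the
# (possibly redirected) URL it was actually retrieved from.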
class Page:

    def __init__(self, text, url):
        self.text = text
        self.url = url

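    # Parse the page (unless it exceeds maxpage bytes) and return a
    # list of (absolute URL, raw link) pairs; each link has its
    # fragment stripped and is resolved against the page's <BASE> URL,
    # falling back to the page's own URL.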
    def getlinkinfos(self):
        size = len(self.text)
        if size > maxpage:
            if verbose > 0:
                print "Skip huge file", self.url
                print " (%.0f Kbytes)" % (size*0.001)
            return []
        if verbose > 2:
            print " Parsing", self.url, "(%d bytes)" % size
        parser = MyHTMLParser(formatter.NullFormatter())
        parser.feed(self.text)
        parser.close()
        rawlinks = parser.getlinks()
        base = urlparse.urljoin(self.url, parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink))
        return infos


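# An in-memory file object that mimics what urllib returns: it
# remembers a URL and a header dictionary so that the directory
# listings generated below can be treated like ordinary responses
# (geturl() and info() work).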
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

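    # Implement the ``file:'' extension described in the module
    # docstring: for a directory, serve its index.html when present,
    # otherwise synthesize an HTML listing consisting of a <BASE> tag
    # plus one <A HREF> per directory entry.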
    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if path[-1] != os.sep:
            url = url + '/'
        if os.path.isdir(path):
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, path)


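# An HTML parser that merely collects link targets: the HREF values of
# all <A> tags (each target recorded once) and the HREF of the <BASE>
# tag, if the page has one; nothing is rendered.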
class MyHTMLParser(htmllib.HTMLParser):

    def __init__(*args):
        self = args[0]
        self.base = None
        self.links = []
        apply(htmllib.HTMLParser.__init__, args)

    def start_a(self, attributes):
        for name, value in attributes:
            if name == 'href' and value and value not in self.links:
                self.links.append(string.strip(value))

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href' and value:
                if verbose > 1:
                    print " Base", value
                self.base = value

    def getlinks(self):
        return self.links

    def getbase(self):
        return self.base


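# Print a link (prefixed by p1) followed by the pages that refer to it,
# one origin per line; the p2 label is replaced by spaces after the
# first origin, and the raw link text is shown in parentheses when it
# differs from the resolved URL.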
def show(p1, link, p2, origins):
    print p1, link
    i = 0
    for source, rawlink in origins:
        i = i+1
        if i == 2:
            p2 = ' '*len(p2)
        print p2, source,
        if rawlink != link: print "(%s)" % rawlink,
        print


if __name__ == '__main__':
    main()