#! /usr/bin/env python

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
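
For example (the path below is only the script's built-in default; point it
at your own document root):

    webchecker.py file:/usr/local/etc/httpd/htdocs/

checks most of the local tree as if an HTTP daemon were serving it.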

Reports printed:

When done, it reports links to pages outside the web (unless -q is
specified), and pages with bad links within the subweb. When
interrupted, it prints those same reports for the pages that it has
checked already.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.
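
A typical session might look like this (the root URL is a placeholder; the
checkpoint name is the default):

    webchecker.py http://www.example.com/    # check; writes @webchecker.pickle
    webchecker.py -R                         # reprint reports, resume if needed
    webchecker.py -Rq                        # reprint only the error report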

Miscellaneous:

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the HTML parser is a bit slow, very large HTML files are
skipped. The size limit can be set with the -m option.

- Before fetching a page, it guesses its type based on its extension.
If it is a known extension and the type is not text/html, the page is
not fetched. This is a huge optimization but occasionally it means
links can be missed. The mimetypes.py module (also in this directory)
has a built-in table mapping most currently known suffixes, and in
addition attempts to read the mime.types configuration files in the
default locations of Netscape and the NCSA HTTP daemon.

- It only follows links indicated by <A> tags. It doesn't follow
links in <FORM> or <IMG> or whatever other tags might contain
hyperlinks. It does honor the <BASE> tag.

- Checking external links is not done by default; use -x to enable
this feature. This is done because checking external links usually
takes a lot of time. When enabled, this check is executed during the
report generation phase (so -x is ignored when -q is specified). Even
when -x is enabled, only ``http:'' URLs are checked.


Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- check external links (during report phase)

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

"""

__version__ = "0.1"


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import htmllib
import formatter

import mimetypes
import robotparser

# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
MAXPAGE = 50000                                 # Ignore files bigger than this
ROUNDSIZE = 50                                  # Number of links processed per round
DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint
AGENTNAME = "webchecker"                        # Agent name for robots.txt parser


# Global variables
verbose = 1
maxpage = MAXPAGE
roundsize = ROUNDSIZE


def main():
    global verbose, maxpage, roundsize
    dumpfile = DUMPFILE
    restart = 0
    checkext = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:vx')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        sys.exit(2)
    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = 1

    if verbose:
        print AGENTNAME, "version", __version__

    if restart:
        if verbose > 0:
            print "Loading checkpoint from %s ..." % dumpfile
        f = open(dumpfile, "rb")
        c = pickle.load(f)
        f.close()
        if verbose > 0:
            print "Done."
            print "Root:", string.join(c.roots, "\n ")
    else:
        c = Checker()
        if not args:
            args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    if not c.todo:
        needsave = 0
    else:
        needsave = 1
    try:
        c.run()
    except KeyboardInterrupt:
        if verbose > 0:
            print "[run interrupted]"
    try:
        c.report(checkext)
    except KeyboardInterrupt:
        if verbose > 0:
            print "[report interrupted]"
    if not needsave:
        if verbose > 0:
            print
            print "No need to save checkpoint"
    elif dumpfile:
        if verbose > 0:
            print
            print "Saving checkpoint to %s ..." % dumpfile
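        # Write the new checkpoint under a temporary name and rename it into
        # place afterwards, so an interrupted dump leaves the previous
        # checkpoint file intact (as promised in the docstring).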
        newfile = dumpfile + ".new"
        f = open(newfile, "wb")
        pickle.dump(c, f)
        f.flush()
        f.close()
        try:
            os.unlink(dumpfile)
        except os.error:
            pass
        os.rename(newfile, dumpfile)
        if verbose > 0:
            print "Done."
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


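# The Checker class holds all the crawl state. It can also be driven
# without the command-line wrapper; a minimal sketch (the root URL below is
# a placeholder, not something this module defines):
#
#     c = Checker()
#     c.addroot("http://www.example.com/")
#     c.run()
#     c.report(checkext=1)    # pass 1 to also probe the external links found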
class Checker:

    def __init__(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.ext = {}
        self.bad = {}
        self.urlopener = MyURLopener()
        self.round = 0
        self.robots = {}

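    # Pickling support: only the plain-data attributes are saved; the URL
    # opener and the per-root robots.txt parsers are rebuilt when a
    # checkpoint is loaded back in.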
    def __getstate__(self):
        return (self.roots, self.todo, self.done,
                self.ext, self.bad, self.round)

    def __setstate__(self, state):
        self.__init__()         # unpickling bypasses __init__; recreate helpers
        (self.roots, self.todo, self.done,
         self.ext, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)

    def addroot(self, root):
        if root not in self.roots:
            self.roots.append(root)
            self.todo[root] = []
            self.addrobot(root)

    def addrobot(self, root):
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        if verbose > 2:
            print "Parsing", url
            rp.debug = 1
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            if verbose > 1:
                print "I/O error parsing", url, ":", msg

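    # Main loop: take up to `roundsize` URLs from the to-do list each round,
    # process them, and move them to the done list, printing a progress
    # summary at the start of every round (unless -q was given).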
    def run(self):
        while self.todo:
            self.round = self.round + 1
            if verbose > 0:
                print
                print "Round", self.round,
                print "(%d to do, %d done, %d external, %d bad)" % (
                    len(self.todo), len(self.done),
                    len(self.ext), len(self.bad))
                print
            urls = self.todo.keys()[:roundsize]
            for url in urls:
                self.dopage(url)
                self.done[url] = self.todo[url]
                del self.todo[url]

    def report(self, checkext=0):
        print
        if not self.todo: print "Final",
        else: print "Interim",
        print "Report (%d to do, %d done, %d external, %d bad)" % (
            len(self.todo), len(self.done),
            len(self.ext), len(self.bad))
        if verbose > 0:
            self.report_extrefs(checkext)
        # Report errors last because the output may get truncated
        self.report_errors()

    def report_extrefs(self, checkext=0):
        if not self.ext:
            print
            print "No external URLs"
            return
        print
        if checkext:
            print "External URLs (checking validity):"
        else:
            print "External URLs (not checked):"
        print
        urls = self.ext.keys()
        urls.sort()
        for url in urls:
            show("HREF ", url, " from", self.ext[url])
            if not checkext:
                continue
            if url[:7] == 'mailto:':
                if verbose > 2: print "Not checking", url
                continue
            if verbose > 2: print "Checking", url, "..."
            try:
                f = self.urlopener.open(url)
                f.close()
                if verbose > 3: print "OK"
            except IOError, msg:
                msg = sanitize(msg)
                print "Error", msg
                self.bad[url] = msg

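    # The error report is grouped by referring page: for each bad URL the
    # recorded origins are looked up (in done, todo, or ext) and the URL is
    # listed under every page that linked to it.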
    def report_errors(self):
        if not self.bad:
            print
            print "No errors"
            return
        print
        print "Error Report:"
        urls = self.bad.keys()
        urls.sort()
        bysource = {}
        for url in urls:
            try:
                origins = self.done[url]
            except KeyError:
                try:
                    origins = self.todo[url]
                except KeyError:
                    origins = self.ext[url]
            for source, rawlink in origins:
                triple = url, rawlink, self.bad[url]
                try:
                    bysource[source].append(triple)
                except KeyError:
                    bysource[source] = [triple]
        sources = bysource.keys()
        sources.sort()
        for source in sources:
            triples = bysource[source]
            print
            if len(triples) > 1:
                print len(triples), "Errors in", source
            else:
                print "Error in", source
            for url, rawlink, msg in triples:
                print " HREF", url,
                if rawlink != url: print "(%s)" % rawlink,
                print
                print " msg", msg

    def dopage(self, url):
        if verbose > 1:
            if verbose > 2:
                show("Page ", url, " from", self.todo[url])
            else:
                print "Page ", url
        page = self.getpage(url)
        if not page:
            return
        for info in page.getlinkinfos():
            link, rawlink = info
            origin = url, rawlink
            if not self.inroots(link):
                try:
                    self.ext[link].append(origin)
                    if verbose > 3:
                        print " Seen ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                except KeyError:
                    if verbose > 3:
                        print " New ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                    self.ext[link] = [origin]
            elif self.done.has_key(link):
                if verbose > 3:
                    print " Done link", link
                self.done[link].append(origin)
            elif self.todo.has_key(link):
                if verbose > 3:
                    print " Seen todo link", link
                self.todo[link].append(origin)
            else:
                if verbose > 3:
                    print " New todo link", link
                self.todo[link] = [origin]

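    # A URL counts as part of the subweb only if it extends one of the roots
    # *and* robots.txt allows this agent to fetch it; disallowed URLs
    # therefore end up in the external-links report instead.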
    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.robots[root].can_fetch(AGENTNAME, url)
        return 0

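    # Fetch one page and wrap it in a Page object. Returns None (recording
    # the URL in self.bad if the fetch failed) when the URL's extension
    # suggests an encoded or non-HTML document, when the fetch raises
    # IOError, or when the served content type is not text/html.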
    def getpage(self, url):
        ctype, encoding = mimetypes.guess_type(url)
        if encoding:
            if verbose > 2:
                print " Won't bother, URL suggests encoding %s" % `encoding`
            return None
        if ctype and ctype != 'text/html':
            if verbose > 2:
                print " Won't bother, URL suggests mime type %s" % `ctype`
            return None
        try:
            f = self.urlopener.open(url)
        except IOError, msg:
            msg = sanitize(msg)
            if verbose > 0:
                print "Error ", msg
                if verbose > 0:
                    show(" HREF ", url, " from", self.todo[url])
            self.bad[url] = msg
            return None
        nurl = f.geturl()
        info = f.info()
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        if nurl != url:
            if verbose > 1:
                print " Redirected to", nurl
        if not ctype:
            ctype, encoding = mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            f.close()
            if verbose > 2:
                print " Not HTML, mime type", ctype
            return None
        text = f.read()
        f.close()
        return Page(text, nurl)


class Page:

    def __init__(self, text, url):
        self.text = text
        self.url = url

    def getlinkinfos(self):
        size = len(self.text)
        if size > maxpage:
            if verbose > 0:
                print "Skip huge file", self.url
                print " (%.0f Kbytes)" % (size*0.001)
            return []
        if verbose > 2:
            print " Parsing", self.url, "(%d bytes)" % size
        parser = MyHTMLParser(formatter.NullFormatter())
        parser.feed(self.text)
        parser.close()
        rawlinks = parser.getlinks()
        base = urlparse.urljoin(self.url, parser.getbase() or "")
        infos = []
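        # Drop the fragment from each raw link so "page.html#sec" and
        # "page.html" collapse into one URL, then resolve it against the
        # page's <BASE> target (or the page's own URL).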
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink))
        return infos


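# A StringIO that also answers the info() and geturl() queries that urllib
# callers make, so a synthesized directory listing can be returned through
# the same interface as a real response object.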
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

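    # Mimic an HTTP daemon for file: URLs, as described in the module
    # docstring: a directory is answered with its index.html when one
    # exists, and otherwise with a generated HTML listing so the checker
    # can keep following links.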
    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if path[-1] != os.sep:
            url = url + '/'
        if os.path.isdir(path):
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, path)


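# Link extractor: records the HREF of every <A> tag and the target of a
# <BASE> tag; other tags that may carry URLs (<IMG>, <FORM>, ...) are
# deliberately ignored, as noted in the module docstring.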
class MyHTMLParser(htmllib.HTMLParser):

    def __init__(*args):
        self = args[0]
        self.base = None
        self.links = []
        apply(htmllib.HTMLParser.__init__, args)

    def start_a(self, attributes):
        for name, value in attributes:
            if name == 'href' and value and value not in self.links:
                self.links.append(string.strip(value))

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href' and value:
                if verbose > 1:
                    print " Base", value
                self.base = value

    def getlinks(self):
        return self.links

    def getbase(self):
        return self.base


def show(p1, link, p2, origins):
    print p1, link
    i = 0
    for source, rawlink in origins:
        i = i+1
        if i == 2:
            p2 = ' '*len(p2)
        print p2, source,
        if rawlink != link: print "(%s)" % rawlink,
        print


def sanitize(msg):
    if (type(msg) == TupleType and
        len(msg) >= 4 and
        msg[0] == 'http error' and
        type(msg[3]) == InstanceType):
        # Remove the Message instance -- it may contain
        # a file object which prevents pickling.
        msg = msg[:3] + msg[4:]
    return msg


if __name__ == '__main__':
    main()