#! /usr/bin/env python

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Reports printed:

When done, it reports links to pages outside the subweb (unless -q is
specified), and pages with bad links within the subweb. When
interrupted, it prints those same reports for the pages that it has
checked already.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

Miscellaneous:

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the HTML parser is a bit slow, very large HTML files are
skipped. The size limit can be set with the -m option.

- Before fetching a page, it guesses its type based on its extension.
If it is a known extension and the type is not text/html, the page is
not fetched. This is a huge optimization but occasionally it means
links can be missed. The mimetypes.py module (also in this directory)
has a built-in table mapping most currently known suffixes, and in
addition attempts to read the mime.types configuration files in the
default locations of Netscape and the NCSA HTTP daemon.

- It only follows links indicated by <A> tags. It doesn't follow
links in <FORM> or <IMG> or whatever other tags might contain
hyperlinks. It does honor the <BASE> tag.

- It could be argued that it should also check external links for
validity. This is true, but it is more error-prone. I think I will
make this an option in the future.


Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-v        -- verbose operation; repeating -v will increase verbosity

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

"""


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import htmllib
import formatter

import mimetypes
import robotparser


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
MAXPAGE = 50000                                 # Ignore files bigger than this
ROUNDSIZE = 50                                  # Number of links processed per round
DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint
AGENTNAME = "webchecker"                        # Agent name for robots.txt parser


# Global variables
verbose = 1
maxpage = MAXPAGE
roundsize = ROUNDSIZE


def main():
    global verbose, maxpage, roundsize
    dumpfile = DUMPFILE
    restart = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__ % globals()
        sys.exit(2)
    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-v':
            verbose = verbose + 1

    if restart:
        if verbose > 0:
            print "Loading checkpoint from %s ..." % dumpfile
        f = open(dumpfile, "rb")
        c = pickle.load(f)
        f.close()
        if verbose > 0:
            print "Done."
            print "Root:", string.join(c.roots, "\n      ")
    else:
        c = Checker()
        if not args:
            args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    if not c.todo:
        needsave = 0
    else:
        needsave = 1
    try:
        c.run()
    except KeyboardInterrupt:
        if verbose > 0:
            print "[interrupted]"
    c.report()
    if not needsave:
        if verbose > 0:
            print
            print "No need to save checkpoint"
    elif dumpfile:
        if verbose > 0:
            print
            print "Saving checkpoint to %s ..." % dumpfile
        newfile = dumpfile + ".new"
        f = open(newfile, "wb")
        pickle.dump(c, f)
        f.flush()
        f.close()
        try:
            os.unlink(dumpfile)
        except os.error:
            pass
        os.rename(newfile, dumpfile)
        if verbose > 0:
            print "Done."
        if dumpfile == DUMPFILE:
            print "Use ``%s -R'' to restart." % sys.argv[0]
        else:
            print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                       dumpfile)


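# The checkpoint saved by main() above is an ordinary pickle of the Checker
# instance defined below, so it can also be inspected interactively. A
# minimal sketch, assuming the dump was left at the default
# "@webchecker.pickle" and that this file is importable as a module named
# "webchecker" (pickle records the class as living in __main__, so pull the
# names into the interactive __main__ with import *):
#
#     from webchecker import *
#     import pickle
#     f = open("@webchecker.pickle", "rb")
#     c = pickle.load(f)
#     f.close()
#     c.report()
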
class Checker:

    def __init__(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.ext = {}
        self.bad = {}
        self.urlopener = MyURLopener()
        self.round = 0
        self.robots = {}

    def __getstate__(self):
        return (self.roots, self.todo, self.done,
                self.ext, self.bad, self.round)

    def __setstate__(self, state):
        (self.roots, self.todo, self.done,
         self.ext, self.bad, self.round) = state
        # Unpickling bypasses __init__, so re-create the unpicklable helpers
        # here before addrobot() tries to use self.robots.
        self.robots = {}
        self.urlopener = MyURLopener()
        for root in self.roots:
            self.addrobot(root)

    def addroot(self, root):
        if root not in self.roots:
            self.roots.append(root)
            self.todo[root] = []
            self.addrobot(root)

    def addrobot(self, root):
        self.robots[root] = rp = robotparser.RobotFileParser()
        if verbose > 3:
            print "Parsing robots.txt file"
            rp.debug = 1
        url = urlparse.urljoin(root, "/robots.txt")
        rp.set_url(url)
        rp.read()

    def run(self):
        while self.todo:
            self.round = self.round + 1
            if verbose > 0:
                print
                print "Round", self.round,
                print "(%d to do, %d done, %d external, %d bad)" % (
                    len(self.todo), len(self.done),
                    len(self.ext), len(self.bad))
                print
            urls = self.todo.keys()[:roundsize]
            for url in urls:
                self.dopage(url)
                self.done[url] = self.todo[url]
                del self.todo[url]

    def report(self):
        print
        if not self.todo: print "Final",
        else: print "Interim",
        print "Report (%d to do, %d done, %d external, %d bad)" % (
            len(self.todo), len(self.done),
            len(self.ext), len(self.bad))
        if verbose > 0:
            self.report_extrefs()
        # Report errors last because the output may get truncated
        self.report_errors()

    def report_extrefs(self):
        if not self.ext:
            print
            print "No external URLs"
            return
        print
        print "External URLs:"
        print
        urls = self.ext.keys()
        urls.sort()
        for url in urls:
            show("HREF ", url, " from", self.ext[url])

    def report_errors(self):
        if not self.bad:
            print
            print "No errors"
            return
        print
        print "Error Report:"
        urls = self.bad.keys()
        urls.sort()
        bysource = {}
        for url in urls:
            try:
                origins = self.done[url]
            except KeyError:
                origins = self.todo[url]
            for source, rawlink in origins:
                triple = url, rawlink, self.bad[url]
                try:
                    bysource[source].append(triple)
                except KeyError:
                    bysource[source] = [triple]
        sources = bysource.keys()
        sources.sort()
        for source in sources:
            triples = bysource[source]
            print
            if len(triples) > 1:
                print len(triples), "Errors in", source
            else:
                print "Error in", source
            for url, rawlink, msg in triples:
                print " HREF", url,
                if rawlink != url: print "(%s)" % rawlink,
                print
                print " msg", msg

    def dopage(self, url):
        if verbose > 1:
            if verbose > 2:
                show("Page ", url, " from", self.todo[url])
            else:
                print "Page ", url
        page = self.getpage(url)
        if not page:
            return
        for info in page.getlinkinfos():
            link, rawlink = info
            origin = url, rawlink
            if not self.inroots(link):
                try:
                    self.ext[link].append(origin)
                    if verbose > 3:
                        print " Seen ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                except KeyError:
                    if verbose > 3:
                        print " New ext link", link,
                        if link != rawlink: print "(%s)" % rawlink,
                        print
                    self.ext[link] = [origin]
            elif self.done.has_key(link):
                if verbose > 3:
                    print " Done link", link
                self.done[link].append(origin)
            elif self.todo.has_key(link):
                if verbose > 3:
                    print " Seen todo link", link
                self.todo[link].append(origin)
            else:
                if verbose > 3:
                    print " New todo link", link
                self.todo[link] = [origin]

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.robots[root].can_fetch(AGENTNAME, url)
        return 0

    def getpage(self, url):
        ctype, encoding = mimetypes.guess_type(url)
        if encoding:
            if verbose > 2:
                print " Won't bother, URL suggests encoding %s" % `encoding`
            return None
        if ctype and ctype != 'text/html':
            if verbose > 2:
                print " Won't bother, URL suggests mime type %s" % `ctype`
            return None
        try:
            f = self.urlopener.open(url)
        except IOError, msg:
            if (type(msg) == TupleType and
                len(msg) >= 4 and
                msg[0] == 'http error' and
                type(msg[3]) == InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
            if verbose > 0:
                print "Error ", msg
            if verbose > 0:
                show(" HREF ", url, " from", self.todo[url])
            self.bad[url] = msg
            return None
        nurl = f.geturl()
        info = f.info()
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        if nurl != url:
            if verbose > 1:
                print " Redirected to", nurl
        if not ctype:
            ctype, encoding = mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            f.close()
            if verbose > 2:
                print " Not HTML, mime type", ctype
            return None
        text = f.read()
        f.close()
        return Page(text, nurl)


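# How the Checker above classifies a candidate link, in sketch form (the
# URLs are illustrative only):
#
#     c = Checker()
#     c.addroot("http://www.example.com/docs/")
#     c.inroots("http://www.example.com/docs/intro.html")
#         # -> result of robots can_fetch(): 1 unless robots.txt disallows it
#     c.inroots("http://www.python.org/")
#         # -> 0, so dopage() would file it under self.ext (external)
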
class Page:

    def __init__(self, text, url):
        self.text = text
        self.url = url

    def getlinkinfos(self):
        size = len(self.text)
        if size > maxpage:
            if verbose > 0:
                print "Skip huge file", self.url
                print " (%.0f Kbytes)" % (size*0.001)
            return []
        if verbose > 2:
            print " Parsing", self.url, "(%d bytes)" % size
        parser = MyHTMLParser(formatter.NullFormatter())
        parser.feed(self.text)
        parser.close()
        rawlinks = parser.getlinks()
        base = urlparse.urljoin(self.url, parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink))
        return infos


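# Worked example of the normalisation done in Page.getlinkinfos() above,
# with made-up URLs: for a page at "http://www.example.com/docs/index.html"
# containing <A HREF="intro.html#part2">, the fragment is stripped via
# urlparse/urlunparse and the remainder is joined against the base:
#
#     urlparse.urljoin("http://www.example.com/docs/index.html", "intro.html")
#         # -> 'http://www.example.com/docs/intro.html'
#
# so the info tuple becomes ('http://www.example.com/docs/intro.html',
# 'intro.html') -- the absolute link plus the raw link minus its fragment.
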
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if path[-1] != os.sep:
            url = url + '/'
        if os.path.isdir(path):
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, path)


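# For a directory with no index.html, open_file() above synthesizes a page
# roughly like the following (the file names are illustrative), which then
# goes through the same HTML parsing path as any fetched page:
#
#     <BASE HREF="file:/usr/local/etc/httpd/htdocs/manual/">
#     <A HREF="intro.html">intro.html</A>
#     <A HREF="ref.html">ref.html</A>
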
class MyHTMLParser(htmllib.HTMLParser):

    def __init__(*args):
        self = args[0]
        self.base = None
        self.links = []
        apply(htmllib.HTMLParser.__init__, args)

    def start_a(self, attributes):
        for name, value in attributes:
            if name == 'href' and value and value not in self.links:
                self.links.append(string.strip(value))

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href' and value:
                if verbose > 1:
                    print " Base", value
                self.base = value

    def getlinks(self):
        return self.links

    def getbase(self):
        return self.base


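# A minimal sketch of how MyHTMLParser above is driven (the HTML snippet is
# illustrative; Page.getlinkinfos() does the same thing with real page text):
#
#     parser = MyHTMLParser(formatter.NullFormatter())
#     parser.feed('<BASE HREF="http://www.example.com/"><A HREF="a.html">a</A>')
#     parser.close()
#     parser.getlinks()   # ['a.html']
#     parser.getbase()    # 'http://www.example.com/'
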
def show(p1, link, p2, origins):
    print p1, link
    i = 0
    for source, rawlink in origins:
        i = i+1
        if i == 2:
            p2 = ' '*len(p2)
        print p2, source,
        if rawlink != link: print "(%s)" % rawlink,
        print


if __name__ == '__main__':
    main()