#! /usr/bin/env python

# Modifications where indicated by Samuel Bayer (SLB), 2/24/99,
# to support checking of links to internal NAME anchors. Key
# modifications are in the dopage() method of the Checker class,
# where the fragments (internal NAME anchors) associated with
# a page are checked, and in the getlinkinfos() method of the
# Page class, where the fragment element of the result of
# urlparse() is saved, rather than discarded. The global effect
# of this change is to make the entries in the "done" and
# "todo" instance variables of Checker into a pair (URL, fragment)
# rather than simply a URL. This change has implications in
# the following methods:

# addroot() of Checker
# report_errors() of Checker
# dopage() of Checker
# newdonelink() of Checker
# newtodolink() of Checker
# format_url() of Checker (new)
# getpage() of Checker
# readhtml() of Checker
# openhtml() of Checker
# openpage() of Checker
# seterror() of Checker
# __init__() of Page()
# getnames() of Page() (new)
# getlinkinfos() of Page()
# start_a() of MyHTMLParser()

# Also added a new command line argument to support specification of
# additional roots, for the circumstance where the root index.html
# page is not in a directory which dominates the entire tree of
# relevant internal references.

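# Illustration of the (URL, fragment) change described above (the URL
# and anchor name here are made-up examples): an entry that used to be
# keyed by the plain URL
#
#     "http://www.example.com/doc.html"
#
# is now keyed by the pair
#
#     ("http://www.example.com/doc.html", "section2")
#
# when the page was reached through a link such as
# <A HREF="doc.html#section2">.
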
"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Report printed:

When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

Miscellaneous:

- You may find the (Tk-based) GUI version easier to use. See wcgui.py.

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.

- (SLB 2/24/99) We now check internal NAME anchor links, as well
as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better. Later...)

- (SLB 2/24/99) If external links are not checked, you can use the -t
flag to provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)

Arguments:

rooturl -- URL to start checking
           (default %(DEFROOT)s)

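Example (an illustrative invocation; www.example.com stands in for a
real site):

    webchecker.py -x -t http://www.example.com/extra/ http://www.example.com/docs/

This checks the subweb rooted at http://www.example.com/docs/ without
checking external links, while still treating pages under the -t root
http://www.example.com/extra/ as internal. To re-print just the errors
recorded in the checkpoint of a previous run, use ``webchecker.py -R -q''.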
"""


__version__ = "$Revision$"


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                                    # Check external references (1 deep)
VERBOSE = 1                                     # Verbosity level (0-3)
MAXPAGE = 150000                                # Ignore files bigger than this
ROUNDSIZE = 50                                  # Number of links processed per round
DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint
AGENTNAME = "webchecker"                        # Agent name for robots.txt parser


# Global variables


def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:

        # Begin SLB 2/24/99: Added -t option here.
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vx')
        # End SLB 2/24/99

    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # Begin SLB 2/24/99: Added extra_roots variable to
    # collect extra roots.
    extra_roots = []
    # End SLB 2/24/99

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)

        # Begin SLB 2/24/99: Added processing for
        # -t option.
        if o == '-t':
            extra_roots.append(a)
        # End SLB 2/24/99

        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize)

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # Begin SLB 2/24/99. The -t flag is only needed if external
    # links are not to be checked. So -t values are ignored unless
    # -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root)
    # End SLB 2/24/99

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", string.join(c.roots, "\n ")
    return c


class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE

    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Begin SLB 2/24/99: Added a name table, so that the
        # name URLs can be checked. Also serves as an implicit
        # cache for which URLs are done.
        self.name_table = {}
        # End SLB 2/24/99

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            # Begin SLB 2/24/99: Modified this call to respect
            # the fact that the "done" and "todo" dictionaries
            # are now (URL, fragment) pairs
            self.newlink((root, ""), ("<root>", root))
            # End SLB 2/24/99

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Begin SLB 2/24/99: Modified this loop to
            # call self.format_url() instead of referring
            # to the URL directly, since the URL in these
            # triples is now a (URL, fragment) pair. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)
            # End SLB 2/24/99.

    def dopage(self, url_pair):

        # Begin SLB 2/24/99: Substantial modification to reflect the change
        # to dictionary entries being (URL, fragment) pairs. All
        # printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        page = self.getpage(url_pair)
        if page:
            url, local_fragment = url_pair
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)
        # End SLB 2/24/99

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        self.done[url].append(origin)

        # Begin SLB 2/24/99: changed reference to URL
        # to call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))
        # End SLB 2/24/99

    def newtodolink(self, url, origin):

        # Begin SLB 2/24/99: changed reference to URL
        # to call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))
        # End SLB 2/24/99

    # Begin SLB 2/24/99: Added method format_url().
    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link
    # End SLB 2/24/99

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):

        # Begin SLB 2/24/99: changed incoming argument name to
        # url_pair, since it's now a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]
        # End SLB 2/24/99

        if url[:7] == 'mailto:' or url[:5] == 'news:':
            self.note(1, " Not checking mailto/news URL")
            return None
        isint = self.inroots(url)

        # Begin SLB 2/24/99: Changed calls to openpage and readhtml
        # in order to ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)
        # End SLB 2/24/99

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)

    # Begin SLB 2/24/99: Modified these next three functions
    # to take (URL, fragment) pairs as arguments, so that openpage()
    # receives the appropriate tuple to record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except IOError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None
    # End SLB 2/24/99

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:

            # Begin SLB 2/24/99: Because of the way the
            # URLs are now processed, I need to check to make
            # sure the URL hasn't been entered in the error list.
            # The first element of the triple here is a (URL, fragment)
            # pair, but the URL key is not, since it's from the
            # list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
            # End SLB 2/24/99

        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.
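    # A minimal sketch of such a subclass (hypothetical, not part of this
    # script) that silences the checker's output, e.g. for a front end
    # that presents results itself:
    #
    #     class QuietChecker(Checker):
    #         def message(self, format, *args):
    #             pass    # drop all messages instead of printing them
    #
    # note() funnels through message(), so overriding message() alone is
    # enough to suppress the printed output.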

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                text = f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # Begin SLB 2/24/99: Moved the parsing of the page into
        # the __init__() routine in order to initialize the list of
        # names the file contains. Stored the parser in an instance
        # variable. Passed the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()
        # End SLB 2/24/99

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Begin SLB 2/24/99: Added method to retrieve names.
    def getnames(self):
        return self.parser.names
    # End SLB 2/24/99

    def getlinkinfos(self):

        # Begin SLB 2/24/99: Moved file reading to __init__() routine.
        # Stored the parser in an instance variable to indicate success
        # of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))
        # End SLB 2/24/99

        return infos


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if path[-1] != os.sep:
            url = url + '/'
        if os.path.isdir(path):
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, path)


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}

        # Begin SLB 2/24/99: Added names instance variable.
        # Modified arglist to take the URL as an argument.
        self.names = []
        self.url = url
        # End SLB 2/24/99

        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # Begin SLB 2/24/99: We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break
        # End SLB 2/24/99

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base


if __name__ == '__main__':
    main()