| #! /usr/bin/env python |
| |
| """A variant on webchecker that creates a mirror copy of a remote site.""" |
| |
| __version__ = "$Revision$" |
| |
| import os |
| import sys |
| import urllib |
| import getopt |
| |
| import webchecker |
| |
# Reduce the revision keyword string to a bare version number.
# A string starting with '$' is split on whitespace; when that yields
# exactly three fields the middle one becomes the version, otherwise the
# string is left as it was.
if __version__[0] == '$':
    _v = __version__.split()
    __version__ = _v[1] if len(_v) == 3 else __version__
| |
def main():
    """Parse the command line, then mirror every root URL given on it.

    Options:
        -q  reset the verbosity to 0
        -v  raise the verbosity by one (may be given several times)

    Options are applied in the order given.  Each remaining argument is
    added to a Sucker instance as a root URL before the run is started.

    Returns 2 when the options cannot be parsed.  Otherwise the function
    falls off the end (returning None) once the run has finished.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        # Diagnostics go to stderr so they never mix with the progress
        # output written to stdout.
        sys.stderr.write("%s\n" % msg)
        sys.stderr.write("usage: %s [-qv] ... [rooturl] ...\n" % sys.argv[0])
        return 2

    # Start from the default verbosity defined by the webchecker module.
    verbose = webchecker.VERBOSE
    for o, a in opts:
        if o == "-q":
            verbose = 0
        elif o == "-v":
            verbose = verbose + 1

    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify this tool (and its version) in the User-agent header.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % arg)
        c.addroot(arg)
    print("Run...")
    c.run()
| |
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it reads.

    Each URL is mapped to a file name of the form <host>/<path> (see
    savefilename).  A page whose file already exists is read from disk;
    otherwise it is fetched and written to that file.
    """

    # Settings consumed by the webchecker.Checker base class.
    # NOTE(review): checkext = 0 presumably disables checking of external
    # links -- confirm against webchecker.
    checkext = 0
    # Name anchor checking is suppressed (see the note below).
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for the page named by url_pair.

        Only url_pair[0] is used to locate the page.  If the matching
        local file can be opened it is used; otherwise the page is
        fetched with self.openpage(), saved to disk, and the URL and file
        name are updated when the response reports a different URL.

        text is None when the page could not be opened or when
        self.checkforhtml() rejects it; url is the (possibly updated) URL.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No usable local copy: go to the network.
            f = self.openpage(url_pair)
            if f:
                # Always release the response, even if reading it fails.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    f.close()
                # Everything fetched is saved; only HTML is handed back.
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy: there are no response headers, so an empty
            # mapping is passed to checkforhtml().
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating parent directories as needed.

        Failures (creating a directory, opening or writing the file) are
        reported through self.message() instead of being raised, so one
        unwritable file does not abort the whole run.
        """
        try:
            makedirs(os.path.dirname(path))
            f = open(path, "wb")
            # Close the file even when the write itself fails.
            try:
                f.write(text)
            finally:
                f.close()
        except (IOError, OSError) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Return the local file name under which url is stored.

        The result is os.path.join(host, path) where host is the
        lower-cased host name (user info and port removed) and path is
        the URL path without leading slashes.  An empty path, or one
        ending in "/", gets "index.html" appended.  ".." segments are
        dropped so that a URL cannot name a file above the host
        directory.  A URL without a host part yields just the path.
        """
        scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        # splithost() returns None for the host when there is no "//host".
        user, host = urllib.splituser(host or "")
        host, port = urllib.splitnport(host)
        host = host.lower()
        # URLs come from remote pages: drop ".." before touching the disk.
        kept = [part for part in path.split("/") if part != ".."]
        path = "/".join(kept).lstrip("/")
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                path = os.sep + path
        path = os.path.join(host, path)
        return path
| |
def makedirs(dir):
    """Create directory dir and any missing parent directories.

    Nothing is done when dir is empty or is already a directory.  When
    dir exists but is not a directory, it is converted on a best-effort
    basis: the existing file is moved aside, the directory is created,
    and the file is moved into it as "index.html".  A failure of that
    conversion is reported on stdout and otherwise ignored.

    A trailing path separator on dir is accepted.

    Raises os.error if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            backup = dir + ".bak"
            try:
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error as msg:
                # Best effort only: report the problem and carry on.
                print("Can't turn %s into a directory: %s" % (dir, msg))
        return
    head, tail = os.path.split(dir)
    if not tail:
        # dir ended in a separator; split again to find the real parent.
        head, tail = os.path.split(head)
    if not tail:
        print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    os.mkdir(dir, 0o777)
| |
# Script entry point: a return value of None from main() becomes exit
# status 0; any other value is passed through as the exit status.
if __name__ == '__main__':
    exit_status = main() or 0
    sys.exit(exit_status)