blob: 67e493dc30c412eb8ab2263159942c99aba51ef1 [file] [log] [blame]
#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site.

Usage: <script> [-qv] ... [rooturl] ...

    -q    set verbosity to 0
    -v    raise verbosity by one (may be repeated)

Every page fetched is written below the current directory as
<host>/<path>, where <host> is the lower-cased host name of the URL.
A URL whose path is empty or ends in "/" is stored as "index.html".
"""

# Revision keyword.  When it has been expanded to "$Revision: N $" the
# module reduces it to the bare number N further down.
__version__ = "$Revision$"
Guido van Rossumd5754801997-10-06 18:54:25 +00006
7import os
8import sys
9import string
10import urllib
11import getopt
12
13import webchecker
Guido van Rossum1a7eae91998-02-21 20:08:39 +000014
# Reduce an expanded revision keyword ("$Revision: N $") to the bare
# number N.  An unexpanded "$Revision$" does not split into three
# fields and is therefore left exactly as it is.
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
Guido van Rossumd5754801997-10-06 18:54:25 +000020
21def main():
Guido van Rossum1a7eae91998-02-21 20:08:39 +000022 verbose = webchecker.VERBOSE
Guido van Rossumd5754801997-10-06 18:54:25 +000023 try:
Guido van Rossum986abac1998-04-06 14:29:28 +000024 opts, args = getopt.getopt(sys.argv[1:], "qv")
Guido van Rossumd5754801997-10-06 18:54:25 +000025 except getopt.error, msg:
Guido van Rossum986abac1998-04-06 14:29:28 +000026 print msg
27 print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
28 return 2
Guido van Rossumd5754801997-10-06 18:54:25 +000029 for o, a in opts:
Guido van Rossum986abac1998-04-06 14:29:28 +000030 if o == "-q":
31 verbose = 0
32 if o == "-v":
33 verbose = verbose + 1
Guido van Rossum1a7eae91998-02-21 20:08:39 +000034 c = Sucker()
35 c.setflags(verbose=verbose)
Guido van Rossumd5754801997-10-06 18:54:25 +000036 c.urlopener.addheaders = [
Guido van Rossum986abac1998-04-06 14:29:28 +000037 ('User-agent', 'websucker/%s' % __version__),
38 ]
Guido van Rossumd5754801997-10-06 18:54:25 +000039 for arg in args:
Guido van Rossum986abac1998-04-06 14:29:28 +000040 print "Adding root", arg
41 c.addroot(arg)
Guido van Rossumd5754801997-10-06 18:54:25 +000042 print "Run..."
43 c.run()
44
45class Sucker(webchecker.Checker):
46
Guido van Rossum1a7eae91998-02-21 20:08:39 +000047 checkext = 0
48
49 def readhtml(self, url):
Guido van Rossum986abac1998-04-06 14:29:28 +000050 text = None
51 path = self.savefilename(url)
52 try:
53 f = open(path, "rb")
54 except IOError:
55 f = self.openpage(url)
56 if f:
57 info = f.info()
58 nurl = f.geturl()
59 if nurl != url:
60 url = nurl
61 path = self.savefilename(url)
62 text = f.read()
63 f.close()
64 self.savefile(text, path)
65 if not self.checkforhtml(info, url):
66 text = None
67 else:
68 if self.checkforhtml({}, url):
69 text = f.read()
70 f.close()
71 return text, url
Guido van Rossumd5754801997-10-06 18:54:25 +000072
73 def savefile(self, text, path):
Guido van Rossum986abac1998-04-06 14:29:28 +000074 dir, base = os.path.split(path)
75 makedirs(dir)
Guido van Rossum909bc181999-01-03 13:06:00 +000076 try:
77 f = open(path, "wb")
78 f.write(text)
79 f.close()
80 self.message("saved %s", path)
81 except IOError, msg:
82 self.message("didn't save %s: %s", path, str(msg))
Guido van Rossumd5754801997-10-06 18:54:25 +000083
84 def savefilename(self, url):
Guido van Rossum986abac1998-04-06 14:29:28 +000085 type, rest = urllib.splittype(url)
86 host, path = urllib.splithost(rest)
87 while path[:1] == "/": path = path[1:]
88 user, host = urllib.splituser(host)
89 host, port = urllib.splitnport(host)
90 host = string.lower(host)
Guido van Rossumd328a9b1998-06-15 12:34:41 +000091 if not path or path[-1] == "/":
92 path = path + "index.html"
Guido van Rossum986abac1998-04-06 14:29:28 +000093 if os.sep != "/":
94 path = string.join(string.split(path, "/"), os.sep)
Guido van Rossumd328a9b1998-06-15 12:34:41 +000095 path = os.path.join(host, path)
Guido van Rossum986abac1998-04-06 14:29:28 +000096 return path
Guido van Rossumd5754801997-10-06 18:54:25 +000097
def makedirs(dir):
    """Create directory dir and any missing parent directories.

    An empty dir is a no-op, as is a dir that already exists as a
    directory.  If dir (or one of its parents) exists but is not a
    directory, it is converted into one and the old file is kept inside
    it as "index.html"; failures during that conversion are ignored.

    A trailing path separator ("a/b/") is accepted.  A path that has no
    final component and no shorter parent to fall back on is reported
    on standard output and left alone.

    Errors from os.mkdir() while creating a new directory propagate to
    the caller as os.error.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            _convert_file_to_dir(dir)
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # Only a trailing separator was split off; the real work is
            # creating head.
            makedirs(head)
        else:
            # One parenthesised argument: reads the same whether print
            # is a statement or a function.
            print("Huh? Don't know how to make dir" + " " + dir)
        return
    makedirs(head)
    # os.mkdir() defaults to mode 0777, the value formerly spelled out.
    os.mkdir(dir)


def _convert_file_to_dir(path):
    """Replace the file at path by a directory holding it as index.html."""
    backup = path + ".bak"
    try:
        os.rename(path, backup)
        os.mkdir(path)
        os.rename(backup, os.path.join(path, "index.html"))
    except os.error:
        # Best effort only.  If the file was moved aside but the
        # directory could not be made, put the file back so it is not
        # left stranded under the ".bak" name.
        if os.path.isfile(backup) and not os.path.exists(path):
            try:
                os.rename(backup, path)
            except os.error:
                pass
116
if __name__ == '__main__':
    # main() returns 2 on a usage error and None otherwise; any false
    # result becomes exit status 0.
    status = main()
    sys.exit(status or 0)