blob: 852df07dc41b76e51652c6d73b77b101a126dd94 [file] [log] [blame]
Guido van Rossumd5754801997-10-06 18:54:25 +00001#! /usr/bin/env python
2
3"""A variant on webchecker that creates a mirror copy of a remote site."""
4
Guido van Rossum1a7eae91998-02-21 20:08:39 +00005__version__ = "$Revision$"
Guido van Rossumd5754801997-10-06 18:54:25 +00006
7import os
8import sys
9import string
10import urllib
11import getopt
12
13import webchecker
Guido van Rossum1a7eae91998-02-21 20:08:39 +000014
# Reduce an expanded revision keyword to its bare number, e.g.
# "$Revision: 1.2 $" becomes "1.2".  Anything that does not split into
# exactly three whitespace-separated fields is left untouched.
if '$' == __version__[0]:
    _v = string.split(__version__)
    if 3 == len(_v):
        __version__ = _v[1]
Guido van Rossumd5754801997-10-06 18:54:25 +000020
21def main():
Guido van Rossum1a7eae91998-02-21 20:08:39 +000022 verbose = webchecker.VERBOSE
Guido van Rossumd5754801997-10-06 18:54:25 +000023 try:
Guido van Rossum986abac1998-04-06 14:29:28 +000024 opts, args = getopt.getopt(sys.argv[1:], "qv")
Guido van Rossumd5754801997-10-06 18:54:25 +000025 except getopt.error, msg:
Guido van Rossum986abac1998-04-06 14:29:28 +000026 print msg
27 print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
28 return 2
Guido van Rossumd5754801997-10-06 18:54:25 +000029 for o, a in opts:
Guido van Rossum986abac1998-04-06 14:29:28 +000030 if o == "-q":
31 verbose = 0
32 if o == "-v":
33 verbose = verbose + 1
Guido van Rossum1a7eae91998-02-21 20:08:39 +000034 c = Sucker()
35 c.setflags(verbose=verbose)
Guido van Rossumd5754801997-10-06 18:54:25 +000036 c.urlopener.addheaders = [
Guido van Rossum986abac1998-04-06 14:29:28 +000037 ('User-agent', 'websucker/%s' % __version__),
38 ]
Guido van Rossumd5754801997-10-06 18:54:25 +000039 for arg in args:
Guido van Rossum986abac1998-04-06 14:29:28 +000040 print "Adding root", arg
41 c.addroot(arg)
Guido van Rossumd5754801997-10-06 18:54:25 +000042 print "Run..."
43 c.run()
44
45class Sucker(webchecker.Checker):
46
Guido van Rossum1a7eae91998-02-21 20:08:39 +000047 checkext = 0
48
49 def readhtml(self, url):
Guido van Rossum986abac1998-04-06 14:29:28 +000050 text = None
51 path = self.savefilename(url)
52 try:
53 f = open(path, "rb")
54 except IOError:
55 f = self.openpage(url)
56 if f:
57 info = f.info()
58 nurl = f.geturl()
59 if nurl != url:
60 url = nurl
61 path = self.savefilename(url)
62 text = f.read()
63 f.close()
64 self.savefile(text, path)
65 if not self.checkforhtml(info, url):
66 text = None
67 else:
68 if self.checkforhtml({}, url):
69 text = f.read()
70 f.close()
71 return text, url
Guido van Rossumd5754801997-10-06 18:54:25 +000072
73 def savefile(self, text, path):
Guido van Rossum986abac1998-04-06 14:29:28 +000074 dir, base = os.path.split(path)
75 makedirs(dir)
76 f = open(path, "wb")
77 f.write(text)
78 f.close()
79 print "saved", path
Guido van Rossumd5754801997-10-06 18:54:25 +000080
81 def savefilename(self, url):
Guido van Rossum986abac1998-04-06 14:29:28 +000082 type, rest = urllib.splittype(url)
83 host, path = urllib.splithost(rest)
84 while path[:1] == "/": path = path[1:]
85 user, host = urllib.splituser(host)
86 host, port = urllib.splitnport(host)
87 host = string.lower(host)
88 path = os.path.join(host, path)
89 if path[-1] == "/": path = path + "index.html"
90 if os.sep != "/":
91 path = string.join(string.split(path, "/"), os.sep)
92 return path
Guido van Rossumd5754801997-10-06 18:54:25 +000093
def makedirs(dir):
    """Create directory dir along with any missing parent directories.

    Nothing happens when dir is empty or already exists.  A trailing
    path separator is tolerated, so "a/b/" is treated like "a/b".  If
    dir still cannot be split into a parent and a final component, a
    message is printed and nothing is created.  Errors from os.mkdir()
    propagate to the caller.
    """
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        # dir ended in a separator, so split() left the tail empty;
        # split the head once more to find the real last component.
        head, tail = os.path.split(head)
    if not tail:
        print("%s %s" % ("Huh? Don't know how to make dir", dir))
        return
    makedirs(head)
    # os.mkdir()'s default mode is 0777 (octal), the value used before.
    os.mkdir(dir)
103
# Run as a script: main() returns 2 on a usage error and None otherwise,
# so map a missing result to exit status 0.
if '__main__' == __name__:
    _status = main() or 0
    sys.exit(_status)