blob: ef2fa44d7e08a321a4094b310cebf4cbb1789403 [file] [log] [blame]
Guido van Rossumd5754801997-10-06 18:54:25 +00001#! /usr/bin/env python
2
3"""A variant on webchecker that creates a mirror copy of a remote site."""
4
Guido van Rossum1a7eae91998-02-21 20:08:39 +00005__version__ = "$Revision$"
Guido van Rossumd5754801997-10-06 18:54:25 +00006
7import os
8import sys
Guido van Rossumd5754801997-10-06 18:54:25 +00009import urllib
10import getopt
11
Guido van Rossum497a1981999-11-17 15:40:48 +000012import webchecker
Guido van Rossum1a7eae91998-02-21 20:08:39 +000013
# If the version-control keyword has been expanded (it then splits into
# three words, e.g. "$Revision:", the number, "$"), keep only the number.
# An unexpanded "$Revision$" is a single word and is left untouched.
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
Guido van Rossumd5754801997-10-06 18:54:25 +000019
def main():
    """Command-line entry point: mirror every root URL given as an argument.

    Options:
        -q  set verbosity to 0
        -v  raise verbosity by one (may be repeated)

    Returns 2 after printing a usage message when the options cannot be
    parsed; otherwise falls off the end (returns None) once the run is done.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % (sys.argv[0],))
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        elif o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify this tool (and its version) to the servers being mirrored.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % (arg,))
        c.addroot(arg)
    print("Run...")
    c.run()
43
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it fetches.

    Each page is written to the relative path computed by savefilename()
    (``<host>/<path>``).  A page whose file already exists is read back
    from disk instead of being fetched again.
    """

    # Settings consumed by webchecker.Checker; presumably they switch off
    # external-link and name-anchor checking -- confirm against webchecker.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return ``(text, url)`` for the page named by ``url_pair[0]``.

        If the page is already in the mirror its file is reused; otherwise
        it is fetched with openpage() and saved.  ``text`` is None when the
        page could not be opened or when checkforhtml() rejects it.  ``url``
        is replaced by the URL reported by geturl() when that differs from
        the requested one.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # Not mirrored yet: fetch it from the network.
            f = self.openpage(url_pair)
            if f:
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # The opener ended up at a different URL; file the
                        # page under that name instead.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    # Release the connection even if reading failed.
                    f.close()
                # Everything fetched is saved; only HTML text is returned.
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Mirrored copy found: no headers are available, so pass an
            # empty mapping to checkforhtml().
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write ``text`` to ``path``, creating parent directories first.

        Failures are reported through self.message() rather than raised, so
        a single unwritable file does not abort the whole run.
        """
        dirname = os.path.dirname(path)
        try:
            # makedirs() raises os.error, which is distinct from IOError on
            # Python 2, so both are caught here.
            makedirs(dirname)
            f = open(path, "wb")
            try:
                f.write(text)
            finally:
                f.close()
        except (IOError, os.error) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Map ``url`` to the relative file name used for its local copy.

        The result is ``<lower-cased host>/<url path>``; user info and port
        are discarded, an empty path or one ending in "/" gets
        "index.html" appended, and "/" is translated to os.sep.  ".." path
        segments are dropped so that a remote URL cannot name a file above
        its host directory.
        """
        scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        # URLs come from remote pages and redirects: strip parent-directory
        # references before they reach the file system.
        path = "/".join([seg for seg in path.split("/") if seg != ".."])
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = host.lower()
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                path = os.sep + path
        path = os.path.join(host, path)
        return path
Guido van Rossumd5754801997-10-06 18:54:25 +0000104
def makedirs(dir):
    """Create directory ``dir`` and any missing parent directories.

    An empty ``dir`` is a no-op.  If ``dir`` already exists as a plain
    file, the file is moved to ``dir/index.html`` and a directory takes its
    place (best effort: an os.error during that swap is ignored).  A
    trailing path separator is tolerated.

    Raises os.error if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # A page was saved under this name earlier; turn it into the
            # index page of a directory with the same name.
            backup = dir + ".bak"
            try:
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # "a/b/" splits into ("a/b", ""): create the part before the
            # trailing separator instead of giving up.
            makedirs(head)
        else:
            print("Huh? Don't know how to make dir %s" % (dir,))
        return
    makedirs(head)
    os.mkdir(dir, 0o777)
123
if __name__ == '__main__':
    # main() returns None after a normal run and 2 on a usage error;
    # map None to exit status 0.
    sys.exit(main() or 0)