blob: 5f726b36ce8593ff6a0fc17e653b2e9d4e2b38f3 [file] [log] [blame]
Guido van Rossumd5754801997-10-06 18:54:25 +00001#! /usr/bin/env python
2
3"""A variant on webchecker that creates a mirror copy of a remote site."""
4
# Module version.  This is the raw "$Revision$" keyword (presumably
# expanded by the version-control system on checkout -- confirm); the
# code after the imports reduces an expanded three-word form to its
# middle word.
__version__ = "$Revision$"
Guido van Rossumd5754801997-10-06 18:54:25 +00006
7import os
8import sys
9import string
10import urllib
11import getopt
12
Guido van Rossum497a1981999-11-17 15:40:48 +000013import webchecker
Guido van Rossum1a7eae91998-02-21 20:08:39 +000014
# Reduce an expanded keyword of the form "$Revision: <number> $" to
# just <number>; any other shape is left untouched.
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
Guido van Rossumd5754801997-10-06 18:54:25 +000020
def main():
    """Command-line entry point: mirror every root URL named in sys.argv.

    Options:
        -q  set verbosity to 0
        -v  raise verbosity by one (may be repeated)

    Returns 2 after printing a usage message when the options cannot be
    parsed; otherwise returns None once the run has finished.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error:
        # sys.exc_info() is used so the clause parses on any Python version.
        msg = sys.exc_info()[1]
        print(msg)
        print(" ".join(("usage:", sys.argv[0], "[-qv] ... [rooturl] ...")))
        return 2

    # Start from webchecker's default verbosity and apply the flags in order.
    verbose = webchecker.VERBOSE
    for flag, _value in opts:
        if flag == "-q":
            verbose = 0
        elif flag == "-v":
            verbose += 1

    sucker = Sucker()
    sucker.setflags(verbose=verbose)
    # Identify ourselves to servers as websucker rather than the default agent.
    sucker.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for root in args:
        print(" ".join(("Adding root", root)))
        sucker.addroot(root)
    print("Run...")
    sucker.run()
44
class Sucker(webchecker.Checker):
    """Checker subclass that keeps a local copy of every page it reads.

    Pages are stored below the current directory under the relative
    file name computed by savefilename().  A page that is already on
    disk is read from there instead of being fetched again.
    """

    # Overrides of webchecker.Checker settings; their exact meaning is
    # defined there (presumably: don't check external links, and don't
    # check name anchors -- confirm against webchecker).
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for the page named by url_pair.

        Only url_pair[0] (the URL) is examined here; the whole pair is
        handed to openpage() when the page has to be fetched.

        text is the page contents, or None when the page could not be
        opened or checkforhtml() rejects it.  url is the URL the text
        belongs to, which is the one reported by geturl() when that
        differs from the requested one.
        """
        url = url_pair[0]
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = None

        if f is not None:
            # Already mirrored: use the local copy.  There are no headers
            # to inspect, so checkforhtml() gets an empty mapping.
            text = None
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
            return text, url

        f = self.openpage(url_pair)
        if not f:
            return None, url
        try:
            info = f.info()
            nurl = f.geturl()
            text = f.read()
        finally:
            # Close the connection even if reading fails part-way.
            f.close()
        if nurl != url:
            # File the page under the URL it was actually served from.
            url = nurl
            path = self.savefilename(url)
        # Everything fetched is saved; only HTML is handed back as text.
        self.savefile(text, path)
        if not self.checkforhtml(info, url):
            text = None
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating parent directories as needed.

        Nothing is raised for I/O problems: success and failure are both
        reported through self.message().  Directory-creation errors
        (os.error) are reported the same way as write errors so that one
        unwritable file does not abort the whole run.
        """
        try:
            makedirs(os.path.dirname(path))
            f = open(path, "wb")
            try:
                f.write(text)
            finally:
                f.close()
        except (IOError, os.error):
            # sys.exc_info() keeps this clause valid on any Python version.
            err = sys.exc_info()[1]
            self.message("didn't save %s: %s", path, str(err))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Map url to the relative local file name it is mirrored under.

        The result is <lower-cased host>/<url path> using the local path
        separator; user info and port are dropped, and a path that names
        a directory gets "index.html" appended.  For example
        "http://WWW.Example.COM:8080/a/b/" maps to
        "www.example.com/a/b/index.html" where the separator is "/".

        URLs come from remote pages and are untrusted, so empty, "."
        and ".." path segments are resolved here and ".." can never
        climb above the host directory.  A URL without a host part is
        filed directly under the current directory instead of raising.

        NOTE(review): the host name itself and platform-specific path
        syntax (e.g. drive letters or backslashes inside a segment) are
        not sanitized.
        """
        rest = urllib.splittype(url)[1]
        host, path = urllib.splithost(rest)
        host = urllib.splituser(host or "")[1]
        host = urllib.splitnport(host)[0].lower()

        pieces = path.split("/")
        segments = []
        for piece in pieces:
            if piece == "..":
                if segments:
                    del segments[-1]
            elif piece and piece != ".":
                segments.append(piece)
        # An empty path, a trailing "/" or a trailing "." / ".." all name
        # a directory, whose contents are stored as index.html.
        if pieces[-1] in ("", ".", ".."):
            segments.append("index.html")

        path = os.sep.join(segments)
        if os.sep != "/" and os.name == "mac":
            # Kept from the original: the "mac" platform gets a leading
            # separator in front of the path part.
            path = os.sep + path
        return os.path.join(host, path)
Guido van Rossumd5754801997-10-06 18:54:25 +0000105
def _file_to_dir(path):
    """Turn the plain file at path into a directory holding it as index.html.

    Best effort: on failure the original file is put back when possible
    and a message is printed; nothing is raised.
    """
    backup = path + ".bak"
    try:
        os.rename(path, backup)
    except os.error:
        print("Can't turn file %s into a directory: %s"
              % (path, sys.exc_info()[1]))
        return
    try:
        os.mkdir(path)
        os.rename(backup, os.path.join(path, "index.html"))
    except os.error:
        err = sys.exc_info()[1]
        if not os.path.exists(path):
            try:
                os.rename(backup, path)
            except os.error:
                # Nothing more can be done; the data stays in the backup.
                pass
        print("Can't turn file %s into a directory: %s" % (path, err))


def makedirs(dir):
    """Create directory dir together with any missing parent directories.

    An empty name or an existing directory is a no-op.  When a plain
    file is in the way it is moved to <dir>/index.html so that the name
    can become a directory (best effort; failures are only printed).
    A trailing path separator is ignored.

    Raises os.error if a directory cannot be created.
    """
    if not dir or os.path.isdir(dir):
        return
    if os.path.exists(dir):
        _file_to_dir(dir)
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # Only a trailing separator was removed, e.g. "a/b/" -> "a/b".
            makedirs(head)
        else:
            print("Huh? Don't know how to make dir %s" % (dir,))
        return
    makedirs(head)
    try:
        # os.mkdir's default mode is already 0777.
        os.mkdir(dir)
    except os.error:
        # Fine if the directory appeared in the meantime; otherwise
        # the failure is real and the caller must see it.
        if not os.path.isdir(dir):
            raise
124
# Run as a script: main()'s result becomes the exit status, with a
# None result (normal completion) mapped to 0.
if __name__ == '__main__':
    _status = main()
    sys.exit(_status or 0)