blob: 745385935588be05a0adca4187f36ff53c47ed57 [file] [log] [blame]
Guido van Rossumd5754801997-10-06 18:54:25 +00001#! /usr/bin/env python
2
3"""A variant on webchecker that creates a mirror copy of a remote site."""
4
Guido van Rossum1a7eae91998-02-21 20:08:39 +00005__version__ = "$Revision$"
Guido van Rossumd5754801997-10-06 18:54:25 +00006
7import os
8import sys
9import string
10import urllib
11import getopt
12
Guido van Rossum64acb5c1999-11-17 15:04:26 +000013import wcnew
14
# The rest of this module refers to the checker implementation as
# "webchecker"; wcnew is the module that actually provides it.
webchecker = wcnew

# __version__ starts out as a "$Revision$" keyword.  When the version
# control system has expanded it to the form "$Revision: N $", it
# splits into exactly three words and the middle one is the number we
# want.  An unexpanded keyword is a single word and is left alone.
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]
Guido van Rossumd5754801997-10-06 18:54:25 +000022
23def main():
Guido van Rossum1a7eae91998-02-21 20:08:39 +000024 verbose = webchecker.VERBOSE
Guido van Rossumd5754801997-10-06 18:54:25 +000025 try:
Guido van Rossum986abac1998-04-06 14:29:28 +000026 opts, args = getopt.getopt(sys.argv[1:], "qv")
Guido van Rossumd5754801997-10-06 18:54:25 +000027 except getopt.error, msg:
Guido van Rossum986abac1998-04-06 14:29:28 +000028 print msg
29 print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
30 return 2
Guido van Rossumd5754801997-10-06 18:54:25 +000031 for o, a in opts:
Guido van Rossum986abac1998-04-06 14:29:28 +000032 if o == "-q":
33 verbose = 0
34 if o == "-v":
35 verbose = verbose + 1
Guido van Rossum1a7eae91998-02-21 20:08:39 +000036 c = Sucker()
37 c.setflags(verbose=verbose)
Guido van Rossumd5754801997-10-06 18:54:25 +000038 c.urlopener.addheaders = [
Guido van Rossum986abac1998-04-06 14:29:28 +000039 ('User-agent', 'websucker/%s' % __version__),
40 ]
Guido van Rossumd5754801997-10-06 18:54:25 +000041 for arg in args:
Guido van Rossum986abac1998-04-06 14:29:28 +000042 print "Adding root", arg
43 c.addroot(arg)
Guido van Rossumd5754801997-10-06 18:54:25 +000044 print "Run..."
45 c.run()
46
47class Sucker(webchecker.Checker):
48
Guido van Rossum1a7eae91998-02-21 20:08:39 +000049 checkext = 0
Guido van Rossum64acb5c1999-11-17 15:04:26 +000050 nonames = 1
Guido van Rossum1a7eae91998-02-21 20:08:39 +000051
Guido van Rossum64acb5c1999-11-17 15:04:26 +000052 # SAM 11/13/99: in general, URLs are now URL pairs.
53 # Since we've suppressed name anchor checking,
54 # we can ignore the second dimension.
55
56 def readhtml(self, url_pair):
57 url = url_pair[0]
Guido van Rossum986abac1998-04-06 14:29:28 +000058 text = None
59 path = self.savefilename(url)
60 try:
61 f = open(path, "rb")
62 except IOError:
Guido van Rossum64acb5c1999-11-17 15:04:26 +000063 f = self.openpage(url_pair)
Guido van Rossum986abac1998-04-06 14:29:28 +000064 if f:
65 info = f.info()
66 nurl = f.geturl()
67 if nurl != url:
68 url = nurl
69 path = self.savefilename(url)
70 text = f.read()
71 f.close()
72 self.savefile(text, path)
73 if not self.checkforhtml(info, url):
74 text = None
75 else:
76 if self.checkforhtml({}, url):
77 text = f.read()
78 f.close()
79 return text, url
Guido van Rossumd5754801997-10-06 18:54:25 +000080
81 def savefile(self, text, path):
Guido van Rossum986abac1998-04-06 14:29:28 +000082 dir, base = os.path.split(path)
83 makedirs(dir)
Guido van Rossum909bc181999-01-03 13:06:00 +000084 try:
85 f = open(path, "wb")
86 f.write(text)
87 f.close()
88 self.message("saved %s", path)
89 except IOError, msg:
90 self.message("didn't save %s: %s", path, str(msg))
Guido van Rossumd5754801997-10-06 18:54:25 +000091
92 def savefilename(self, url):
Guido van Rossum986abac1998-04-06 14:29:28 +000093 type, rest = urllib.splittype(url)
94 host, path = urllib.splithost(rest)
95 while path[:1] == "/": path = path[1:]
96 user, host = urllib.splituser(host)
97 host, port = urllib.splitnport(host)
98 host = string.lower(host)
Guido van Rossumd328a9b1998-06-15 12:34:41 +000099 if not path or path[-1] == "/":
Guido van Rossum64acb5c1999-11-17 15:04:26 +0000100 path = path + "index.html"
Guido van Rossum986abac1998-04-06 14:29:28 +0000101 if os.sep != "/":
102 path = string.join(string.split(path, "/"), os.sep)
Guido van Rossumd328a9b1998-06-15 12:34:41 +0000103 path = os.path.join(host, path)
Guido van Rossum986abac1998-04-06 14:29:28 +0000104 return path
Guido van Rossumd5754801997-10-06 18:54:25 +0000105
def makedirs(dir):
    """Create directory dir, together with any missing parents.

    An empty name or an existing directory is left alone.  If dir
    exists but is not a directory, the existing file is moved to
    <dir>/index.html so that the name can become a directory; this
    step is best effort and errors in it are ignored.  Trailing path
    separators ("a/b/") are accepted and mean the same as "a/b".

    Errors from creating a new directory (os.error) propagate to the
    caller.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # A page was saved under this name earlier.  Go through a
            # temporary ".bak" name to turn it into <dir>/index.html.
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                # Deliberately ignored: whoever writes into dir next
                # will run into the problem and report it.
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # Only trailing separators were split off; head names the
            # directory that was actually asked for.
            makedirs(head)
            return
        # Single-argument form, so this prints the same text whether
        # print is a statement or a function.
        print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    # The default mode is 0777, as for the mkdir call above.
    os.mkdir(dir)
124
# Run as a script: the process exit status is whatever main() returned,
# with a false result (such as None) reported as 0.
if __name__ == '__main__':
    raise SystemExit(main() or 0)