#! /usr/bin/env python
2
3"""A variant on webchecker that creates a mirror copy of a remote site."""
4
5__version__ = "0.1"
6
7import os
8import sys
9import string
10import urllib
11import getopt
12
13import webchecker
14verbose = webchecker.verbose
15
16def main():
17 global verbose
18 try:
19 opts, args = getopt.getopt(sys.argv[1:], "qv")
20 except getopt.error, msg:
21 print msg
22 print "usage:", sys.argv[0], "[-v] ... [rooturl] ..."
23 return 2
24 for o, a in opts:
25 if o == "-q":
26 webchecker.verbose = verbose = 0
27 if o == "-v":
28 webchecker.verbose = verbose = verbose + 1
29 c = Sucker(0)
30 c.urlopener.addheaders = [
31 ('User-agent', 'websucker/%s' % __version__),
32 ]
33 for arg in args:
34 print "Adding root", arg
35 c.addroot(arg)
36 print "Run..."
37 c.run()
38
39class Sucker(webchecker.Checker):
40
41 # Alas, had to copy this to make one change...
42 def getpage(self, url):
43 if url[:7] == 'mailto:' or url[:5] == 'news:':
44 if verbose > 1: print " Not checking mailto/news URL"
45 return None
46 isint = self.inroots(url)
47 if not isint and not self.checkext:
48 if verbose > 1: print " Not checking ext link"
49 return None
50 path = self.savefilename(url)
51 saved = 0
52 try:
53 f = open(path, "rb")
54 except IOError:
55 try:
56 f = self.urlopener.open(url)
57 except IOError, msg:
58 msg = webchecker.sanitize(msg)
59 if verbose > 0:
60 print "Error ", msg
61 if verbose > 0:
62 webchecker.show(" HREF ", url, " from", self.todo[url])
63 self.setbad(url, msg)
64 return None
65 if not isint:
66 if verbose > 1: print " Not gathering links from ext URL"
67 safeclose(f)
68 return None
69 nurl = f.geturl()
70 if nurl != url:
71 path = self.savefilename(nurl)
72 info = f.info()
73 else:
74 if verbose: print "Loading cached URL", url
75 saved = 1
76 nurl = url
77 info = {}
78 if url[-1:] == "/":
79 info["content-type"] = "text/html"
80 text = f.read()
81 if not saved: self.savefile(text, path)
82 if info.has_key('content-type'):
83 ctype = string.lower(info['content-type'])
84 else:
85 ctype = None
86 if nurl != url:
87 if verbose > 1:
88 print " Redirected to", nurl
89 if not ctype:
90 ctype, encoding = webchecker.mimetypes.guess_type(nurl)
91 if ctype != 'text/html':
92 webchecker.safeclose(f)
93 if verbose > 1:
94 print " Not HTML, mime type", ctype
95 return None
96 f.close()
97 return webchecker.Page(text, nurl)
98
99 def savefile(self, text, path):
100 dir, base = os.path.split(path)
101 makedirs(dir)
102 f = open(path, "wb")
103 f.write(text)
104 f.close()
105 print "saved", path
106
107 def savefilename(self, url):
108 type, rest = urllib.splittype(url)
109 host, path = urllib.splithost(rest)
110 while path[:1] == "/": path = path[1:]
111 user, host = urllib.splituser(host)
112 host, port = urllib.splitnport(host)
113 host = string.lower(host)
114 path = os.path.join(host, path)
115 if path[-1] == "/": path = path + "index.html"
116 if os.sep != "/":
117 path = string.join(string.split(path, "/"), os.sep)
118 return path
119
def makedirs(dir):
    """Recursively create dir and any missing ancestors.

    Unlike os.mkdir, silently does nothing when dir already exists or
    is empty; complains (but does not raise) on an unsplittable path.
    """
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Path splits to an empty tail (e.g. a bare root) -- can't recurse.
        sys.stdout.write("Huh? Don't know how to make dir " + str(dir) + "\n")
        return
    makedirs(head)
    os.mkdir(dir, 511)  # 511 == 0777; actual mode is modified by umask
129
if __name__ == '__main__':
    # Exit with main()'s status; a None (or 0) result becomes exit code 0.
    status = main()
    sys.exit(status or 0)