# Copyright 2009 Google Inc. Released under the GPL v2

import re, time, urllib2, urlparse, HTMLParser

from autotest_lib.mirror import database
from autotest_lib.client.common_lib import utils


class source(object):
    """
    Abstract Base Class for the source classes.
    """
    def __init__(self, database):
        self.database = database


    def _get_new_files(self, files):
        """
        Return a copy of "files" containing only the entries that are not
        already recorded in the database of known old files.
        """
        old_files = self.database.get_dictionary()
        return dict(filter(lambda x: x[0] not in old_files, files.iteritems()))


    def get_new_files(self):
        raise NotImplementedError('get_new_files not implemented')


    def store_files(self, files):
        self.database.merge_dictionary(files)


class rsync_source(source):
    _cmd_template = '/usr/bin/rsync -rltz --no-motd %s %s/%s'

    def __init__(self, database, prefix, excludes=()):
        super(rsync_source, self).__init__(database)

        self.prefix = prefix
        self.exclude = ' '.join(['--exclude "' + x + '"' for x in excludes])
        self.sources = []


    def _parse_output(self, output, prefix):
        """
        Parse rsync's "ls -l" style listing output and return a dictionary
        of database.item objects indexed by the "name" field.
        """
        regex = re.compile(
            r'-[rwx-]{9} +(\d+) (\d{4}/\d\d/\d\d \d\d:\d\d:\d\d) (.*)')
        res = {}
        for line in output.splitlines():
            match = regex.match(line)
            if match:
                groups = match.groups()
                timestamp = time.mktime(time.strptime(groups[1],
                                                      '%Y/%m/%d %H:%M:%S'))
                if prefix:
                    fname = '%s/%s' % (prefix, groups[2])
                else:
                    fname = groups[2]

                item = database.item(fname, int(groups[0]), int(timestamp))
                res[item.name] = item

        return res


    def add_path(self, src, prefix=''):
        """
        Add a path to synchronize from the source.
        """
        self.sources.append((src, prefix))


    def get_new_files(self):
        """
        Implement source.get_new_files using rsync's listing feature.
        """
        files = {}
        for src, prefix in self.sources:
            output = utils.system_output(self._cmd_template %
                                         (self.exclude, self.prefix, src))
            files.update(self._parse_output(output, prefix))

        return self._get_new_files(files)

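# A minimal usage sketch for rsync_source (the rsync URL, the excludes and
# database.dict_database are illustrative assumptions, not part of this
# module's contract):
#
#     db = database.dict_database('rsync.db')
#     src = rsync_source(db, 'rsync://mirror.example.com/kernel',
#                        excludes=['*.sign'])
#     src.add_path('v2.6', prefix='v2.6')
#     new_files = src.get_new_files()
#     src.store_files(new_files)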

class _ahref_parser(HTMLParser.HTMLParser):
    def reset(self, url=None, pattern=None):
        # extend HTMLParser.reset() so the parser can be re-armed with a
        # new base URL and link pattern between documents
        HTMLParser.HTMLParser.reset(self)
        self.url = url
        self.pattern = pattern
        self.links = []


    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # compose an absolute URL if a relative "href" is found
                    url = urlparse.urljoin(self.url, value)
                    if self.pattern.match(url):
                        self.links.append(url)


    def get_ahref_list(self, url, pattern):
        self.reset(url, pattern)
        self.feed(urllib2.urlopen(url).read())
        self.close()

        return self.links

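# A usage sketch for _ahref_parser on its own (URL and pattern are
# illustrative; url_source below drives it the same way):
#
#     parser = _ahref_parser()
#     links = parser.get_ahref_list('http://mirror.example.com/index.html',
#                                   re.compile(r'.*\.tar\.bz2$'))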

class url_source(source):
    """
    A simple URL-based source that parses HTML to find references to
    kernel files.
    """
    _extension_pattern = re.compile(r'.*\.[^/.]+$')

    def __init__(self, database, prefix):
        super(url_source, self).__init__(database)
        self.prefix = prefix
        self.urls = []


    def add_url(self, url, pattern):
        """
        Add a URL path to an HTML document with links to kernel files.

        @param url: URL path to an HTML file with links to kernel files
                (can be either an absolute URL or one relative to self.prefix)
        @param pattern: regex pattern to filter kernel file links out of
                all other links found in the HTML document
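
        Example (illustrative values): add_url('v2.6/',
        r'.*/patch-2\.6\.\d+\.bz2$') would track the bzip2 patch links
        found on the v2.6/ index page.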
137 """
138 # if it does not have an extension then it's a directory and it needs
139 # a trailing '/'. NOTE: there are some false positives such as
140 # directories named "v2.6" where ".6" will be assumed to be extension.
141 # In order for these to work the caller must provide a trailing /
142 if url[-1:] != '/' and not self._extension_pattern.match(url):
143 url = url + '/'
144 self.urls.append((url, re.compile(pattern)))


    @staticmethod
    def _get_item(url):
        """
        Get a database.item object by fetching the relevant HTTP headers
        of the document pointed to by the given URL.
        """
        try:
            info = urllib2.urlopen(url).info()
        except IOError, err:
            # the file is referenced but does not exist
            print 'WARNING: %s' % err
            return None

        size = info.get('content-length')
        if size:
            size = int(size)
        else:
            size = -1

        # getdate() returns None when the Date header is missing, which
        # would make time.mktime() raise; fall back to 0 in that case
        date_tuple = info.getdate('date')
        if date_tuple:
            timestamp = int(time.mktime(date_tuple))
        else:
            timestamp = 0

        return database.item(url, size, timestamp)


    def get_new_files(self):
        parser = _ahref_parser()

        files = {}
        for url, pattern in self.urls:
            links = parser.get_ahref_list(urlparse.urljoin(self.prefix, url),
                                          pattern)
            for link in links:
                item = self._get_item(link)
                if item:
                    files[item.name] = item

        return self._get_new_files(files)
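
# A minimal usage sketch for url_source (the URL, the pattern and
# database.dict_database are illustrative assumptions, not guaranteed by
# this module):
#
#     db = database.dict_database('url.db')
#     src = url_source(db, 'http://mirror.example.com/pub/linux/kernel/')
#     src.add_url('v2.6/', r'.*/linux-2\.6\.\d+\.tar\.bz2$')
#     new_files = src.get_new_files()
#     src.store_files(new_files)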