# Copyright 2009 Google Inc. Released under the GPL v2

import re, time, urllib2, urlparse, HTMLParser

from autotest_lib.mirror import database
from autotest_lib.client.common_lib import utils


class source(object):
    """
    Abstract base class for the mirror source classes.

    The database object given to the constructor must provide the
    get_dictionary() and merge_dictionary() methods used below (see
    autotest_lib.mirror.database).
    """
    def __init__(self, database):
        self.database = database


    def _get_new_files(self, files):
        """
        Return a copy of "files" after filtering out known old files
        from "files".
        """
        old_files = self.database.get_dictionary()
        return dict(filter(lambda x: x[0] not in old_files, files.iteritems()))


    def get_new_files(self):
        raise NotImplementedError('get_new_files not implemented')


    def store_files(self, files):
        self.database.merge_dictionary(files)


class rsync_source(source):
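    # Note: with no destination argument rsync only lists the matching files
    # in an "ls -l" style format, which _parse_output() below understands.
    # -rltz = recurse, copy symlinks, preserve mtimes, compress;
    # --no-motd keeps the daemon's message of the day out of the listing.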
    _cmd_template = '/usr/bin/rsync -rltz --no-motd %s %s/%s'

    def __init__(self, database, prefix, excludes=[]):
        super(rsync_source, self).__init__(database)

        self.prefix = prefix
        self.exclude = ' '.join(['--exclude "' + x + '"' for x in excludes])
        self.sources = []


    def _parse_output(self, output, prefix):
        """
        Parse rsync's "ls -l" style output and return a dictionary of
        database.item indexed by the "name" field.
        """
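        # a matching listing line looks like (mode, size, mtime, name):
        #   -rw-r--r--     94519261 2009/07/29 14:34:30 linux-2.6.30.tar.bz2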
        regex = re.compile(
            r'-[rwx-]{9} +(\d+) (\d{4}/\d\d/\d\d \d\d:\d\d:\d\d) (.*)')
        res = {}
        for line in output.splitlines():
            match = regex.match(line)
            if match:
                groups = match.groups()
                timestamp = time.mktime(time.strptime(groups[1],
                                                      '%Y/%m/%d %H:%M:%S'))
                if prefix:
                    fname = '%s/%s' % (prefix, groups[2])
                else:
                    fname = groups[2]

                item = database.item(fname, int(groups[0]), int(timestamp))
                res[item.name] = item

        return res


    def add_path(self, src, prefix=''):
        """
        Add a path to synchronize from the source.
        """
        self.sources.append((src, prefix))


    def get_new_files(self):
        """
        Implement source.get_new_files by using the rsync listing feature.
        """
        files = {}
        for src, prefix in self.sources:
            output = utils.system_output(self._cmd_template %
                                         (self.exclude, self.prefix, src))
            files.update(self._parse_output(output, prefix))

        return self._get_new_files(files)

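# Example usage of rsync_source (a sketch only; the mirror URL, the file
# patterns and the dict_database backend from autotest_lib.mirror.database
# are assumptions to adapt to the local setup):
#
#     db = database.dict_database('rsync.db')
#     src = rsync_source(db, 'rsync://rsync.kernel.org/pub/linux/kernel',
#                        excludes=['*.sign'])
#     src.add_path('v2.6/linux-2.6.*.tar.bz2', prefix='v2.6')
#     new_files = src.get_new_files()
#     # ... fetch and process the new files, then mark them as known ...
#     src.store_files(new_files)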

class _ahref_parser(HTMLParser.HTMLParser):
    def reset(self, url=None, pattern=None):
        HTMLParser.HTMLParser.reset(self)
        self.url = url
        self.pattern = pattern
        self.links = []


    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # compose an absolute URL if a relative "href" was found
                    url = urlparse.urljoin(self.url, value)
                    if self.pattern.match(url):
                        self.links.append(url)


    def get_ahref_list(self, url, pattern):
        self.reset(url, pattern)
        self.feed(urllib2.urlopen(url).read())
        self.close()

        return self.links


class url_source(source):
    """
    A simple URL based source that parses HTML to find references to
    kernel files.
    """
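    # matches URLs whose last path component ends in a "file extension"
    # (used by add_url() below to tell file links apart from directories)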
    _extension_pattern = re.compile(r'.*\.[^/.]+$')

    def __init__(self, database, prefix):
        super(url_source, self).__init__(database)
        self.prefix = prefix
        self.urls = []


    def add_url(self, url, pattern):
        """
        Add the URL of an HTML document containing links to kernel files.

        @param url: URL of an HTML document with links to kernel files
            (can be either an absolute URL or one relative to self.prefix)
        @param pattern: regex pattern to filter kernel file links out of
            all other links found in the HTML document
        """
        # If the URL has no extension it is assumed to be a directory and
        # needs a trailing '/'. NOTE: there are some false positives, such
        # as directories named "v2.6" where ".6" will be taken for an
        # extension; for these to work the caller must provide the
        # trailing '/' explicitly.
        if url[-1:] != '/' and not self._extension_pattern.match(url):
            url = url + '/'
        self.urls.append((url, re.compile(pattern)))


    @staticmethod
    def _get_item(url):
        """
        Get a database.item object by fetching relevant HTTP information
        from the document pointed to by the given url.
        """
        try:
            info = urllib2.urlopen(url).info()
        except IOError, err:
            # the file is referenced but does not exist
            print 'WARNING: %s' % err
            return None

        size = info.get('content-length')
        if size:
            size = int(size)
        else:
            size = -1

        # info.getdate() returns None when the Date header is missing or
        # unparseable; guard against passing None to time.mktime()
        date = info.getdate('date')
        if date:
            timestamp = int(time.mktime(date))
        else:
            timestamp = 0

        return database.item(url, size, timestamp)


    def get_new_files(self):
        parser = _ahref_parser()

        files = {}
        for url, pattern in self.urls:
            links = parser.get_ahref_list(urlparse.urljoin(self.prefix, url),
                                          pattern)
            for link in links:
                item = self._get_item(link)
                if item:
                    files[item.name] = item

        return self._get_new_files(files)
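

# Example usage of url_source (a sketch only; the URL and the regex below
# are illustrative and should be adapted to the mirrored site):
#
#     db = database.dict_database('url.db')
#     src = url_source(db, 'http://www.kernel.org/pub/linux/kernel/')
#     src.add_url('v2.6/', r'.*/linux-2\.6\.\d+\.tar\.bz2$')
#     new_files = src.get_new_files()
#     src.store_files(new_files)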