# Copyright 2009 Google Inc. Released under the GPL v2

import re, time, urllib2, urlparse, HTMLParser

from autotest_lib.mirror import database
from autotest_lib.client.common_lib import utils


class source(object):
    """
    Abstract Base Class for the source classes.
    """
    def __init__(self, database):
        self.database = database


    def _get_new_files(self, files):
        """
        Return a copy of "files" containing only the entries that are not
        already recorded in the database of known old files.
        """
        old_files = self.database.get_dictionary()
        return dict(filter(lambda x: x[0] not in old_files, files.iteritems()))


    def get_new_files(self):
        raise NotImplementedError('get_new_files not implemented')


    def store_files(self, files):
        self.database.merge_dictionary(files)


class rsync_source(source):
    _cmd_template = '/usr/bin/rsync -rltz --no-motd %s %s/%s'

    def __init__(self, database, prefix, excludes=()):
        super(rsync_source, self).__init__(database)

        self.prefix = prefix
        self.exclude = ' '.join(['--exclude "' + x + '"' for x in excludes])
        self.sources = []


    def _parse_output(self, output, prefix):
        """
        Parse rsync's "ls -l" style listing output and return a dictionary
        of database.item objects indexed by the "name" field.
        """
        regex = re.compile(
            r'-[rwx-]{9} +(\d+) (\d{4}/\d\d/\d\d \d\d:\d\d:\d\d) (.*)')
        res = {}
        for line in output.splitlines():
            match = regex.match(line)
            if match:
                groups = match.groups()
                timestamp = time.mktime(time.strptime(groups[1],
                                                      '%Y/%m/%d %H:%M:%S'))
                if prefix:
                    fname = '%s/%s' % (prefix, groups[2])
                else:
                    fname = groups[2]

                item = database.item(fname, int(groups[0]), int(timestamp))
                res[item.name] = item

        return res


    def add_path(self, src, prefix=''):
        """
        Add a path to synchronize from the source.
        """
        self.sources.append((src, prefix))


    def get_new_files(self):
        """
        Implement source.get_new_files using rsync's listing feature.
        """
        files = {}
        for src, prefix in self.sources:
            output = utils.system_output(self._cmd_template %
                                         (self.exclude, self.prefix, src))
            files.update(self._parse_output(output, prefix))

        return self._get_new_files(files)

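# A minimal usage sketch for rsync_source (the rsync URL, the excludes and
# database.dict_database are illustrative assumptions, not part of this
# module's contract):
#
#     db = database.dict_database('rsync.db')
#     src = rsync_source(db, 'rsync://mirror.example.com/kernel',
#                        excludes=['*.sign'])
#     src.add_path('v2.6', prefix='v2.6')
#     new_files = src.get_new_files()
#     src.store_files(new_files)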

class _ahref_parser(HTMLParser.HTMLParser):
    def reset(self, url=None, pattern=None):
        # extend HTMLParser.reset() so the parser can be re-armed with a
        # new base URL and link pattern between documents
        HTMLParser.HTMLParser.reset(self)
        self.url = url
        self.pattern = pattern
        self.links = []


    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # compose an absolute URL if a relative "href" is found
                    url = urlparse.urljoin(self.url, value)
                    if self.pattern.match(url):
                        self.links.append(url)


    def get_ahref_list(self, url, pattern):
        self.reset(url, pattern)
        self.feed(urllib2.urlopen(url).read())
        self.close()

        return self.links

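# A usage sketch for _ahref_parser on its own (URL and pattern are
# illustrative; url_source below drives it the same way):
#
#     parser = _ahref_parser()
#     links = parser.get_ahref_list('http://mirror.example.com/index.html',
#                                   re.compile(r'.*\.tar\.bz2$'))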

class url_source(source):
    """
    A simple URL-based source that parses HTML to find references to
    kernel files.
    """
    _extension_pattern = re.compile(r'.*\.[^/.]+$')

    def __init__(self, database, prefix):
        super(url_source, self).__init__(database)
        self.prefix = prefix
        self.urls = []


    def add_url(self, url, pattern):
        """
        Add a URL path to an HTML document with links to kernel files.

        @param url: URL path to an HTML file with links to kernel files
                (can be either an absolute URL or one relative to self.prefix)
        @param pattern: regex pattern to filter kernel file links out of
                all other links found in the HTML document
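
        Example (illustrative values): add_url('v2.6/',
        r'.*/patch-2\.6\.\d+\.bz2$') would track the bzip2 patch links
        found on the v2.6/ index page.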
137 """
138 # if it does not have an extension then it's a directory and it needs
139 # a trailing '/'. NOTE: there are some false positives such as
140 # directories named "v2.6" where ".6" will be assumed to be extension.
141 # In order for these to work the caller must provide a trailing /
142 if url[-1:] != '/' and not self._extension_pattern.match(url):
143 url = url + '/'
144 self.urls.append((url, re.compile(pattern)))


    @staticmethod
    def _get_item(url):
        """
        Get a database.item object by fetching the relevant HTTP headers
        of the document pointed to by the given URL.
        """
        try:
            info = urllib2.urlopen(url).info()
        except IOError, err:
            # the file is referenced but does not exist
            print 'WARNING: %s' % err
            return None

        size = info.get('content-length')
        if size:
            size = int(size)
        else:
            size = -1

        # getdate() returns None when the Date header is missing, which
        # would make time.mktime() raise; fall back to 0 in that case
        date_tuple = info.getdate('date')
        if date_tuple:
            timestamp = int(time.mktime(date_tuple))
        else:
            timestamp = 0

        return database.item(url, size, timestamp)


    def get_new_files(self):
        parser = _ahref_parser()

        files = {}
        for url, pattern in self.urls:
            links = parser.get_ahref_list(urlparse.urljoin(self.prefix, url),
                                          pattern)
            for link in links:
                item = self._get_item(link)
                if item:
                    files[item.name] = item

        return self._get_new_files(files)
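
# A minimal usage sketch for url_source (the URL, the pattern and
# database.dict_database are illustrative assumptions, not guaranteed by
# this module):
#
#     db = database.dict_database('url.db')
#     src = url_source(db, 'http://mirror.example.com/pub/linux/kernel/')
#     src.add_url('v2.6/', r'.*/linux-2\.6\.\d+\.tar\.bz2$')
#     new_files = src.get_new_files()
#     src.store_files(new_files)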