blob: 12b697fd7e49218816d83d227cfe7de622f315f8 [file] [log] [blame]
#!/usr/bin/python
# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
This script crawls crbug. Sort-of.
Invocation:
Get all bugs with labels, strings (in summary and/or comments):
crbug_crawler.py --labels 'one two three'
--queries '"first query" "second query"'
Get baddest open bugs of all time:
crbug_crawler.py --reap
Tips:
- Label based queries will return faster than text queries.
- contrib/crbug_shell.py is a wrapper that allows you to incrementally
filter search results using this script.
"""
import argparse
import cmd
import logging
import sys
import shlex
import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.server.cros.dynamic_suite import reporting
def _parse_args(args):
if not args:
import crbug_crawler
logging.error('Improper usage of crbug_crawler: %s',
crbug_crawler.__doc__)
sys.exit(1)
description = ('Usage: crbug_crawler.py --reap')
parser = argparse.ArgumentParser(description=description)
parser.add_argument('--quiet', help=('Turn off logging noise.'),
action='store_true', default=False)
parser.add_argument('--num', help='Number of issues to output.', default=10,
type=int)
parser.add_argument('--queries',
help=('Search query. Eg: --queries "%s %s"' %
('build_Root', 'login')),
default='')
parser.add_argument('--labels',
help=('Search labels. Eg: --labels "%s %s"' %
('autofiled', 'Pri-1')), default=None)
parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
action='store_true', default=False)
return parser.parse_args(args)
class Update(object):
    """Value object describing a single change to apply to a bug.

    An update carries a comment, a collection of labels and a status. The
    class-level lists enumerate the status names considered open and closed.
    """

    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
                     'Started', 'ExternalDependency']
    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']

    def __init__(self, comment='', labels='', status=''):
        """Store the fields of the update.

        @param comment: The comment text of the update.
        @param labels: The labels of the update. Any falsy value (such as
            the '' default) is stored as a new empty list.
        @param status: The status of the update.
        """
        self.status = status
        self.labels = labels or []
        self.comment = comment

    def __str__(self):
        """Return the status, then labels and comment when they are set."""
        pieces = ['status: %s' % self.status]
        if self.labels:
            pieces.append('labels: %s' % (self.labels,))
        if self.comment:
            pieces.append('comment: %s' % (self.comment,))
        return ' '.join(pieces)
class UpdateManager(object):
    """Update manager that allows you to revert status updates.

    This class keeps track of the last update applied to each issue and is
    capable of reverting its status. `history` maps an issue id to the state
    of the issue before the last committed update, `present` maps an issue
    id to the last update that was committed.
    """

    def __init__(self, autocommit=False):
        """Initialize update manager.

        @param autocommit: If False just print out the update instead
            of committing it.
        """
        self.history = {}
        self.present = {}
        self.reporter = reporting.Reporter()
        self.phapi_lib = self.reporter.get_bug_tracker_client()
        self.autocommit = autocommit

    def revert(self):
        """Revert the status of every issue updated through this manager.

        Only manages status reverts as of now; labels and comments of the
        applied update are logged so they can be undone manually.
        """
        # Iterate over a snapshot: update() records a new history entry for
        # each issue it commits, so the dict is written to during the loop.
        for issue_id, update in list(self.history.items()):
            # The entry can be missing if committing the update raised after
            # the history had already been recorded.
            applied = self.present.get(issue_id)
            if applied is not None:
                logging.warning(
                        'You will have to manually update %s and %s on %s',
                        applied.labels, applied.comment, issue_id)
            # Create a new update with just the status.
            self.update(issue_id, Update(status=update.status))

    def update(self, old_issue, update):
        """Record the state of an issue before updating it.

        @param old_issue: The issue to update. If an id is specified an
            issue is constructed. If an issue object (as defined in phapi_lib
            Issue) is passed in, it is used directly.
        @param update: The Update object to apply to the issue. If it has no
            status, the current status of the issue is filled in.

        @raises ValueError: If the status of the update is neither a known
            open status nor a known closed status.
        """
        if isinstance(old_issue, int):
            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
        old_update = Update(
                labels=old_issue.labels, status=old_issue.status)

        if not update.status:
            update.status = old_update.status
        elif (update.status not in Update.open_statuses and
              update.status not in Update.closed_statuses):
            raise ValueError('Unknown status %s' % update.status)

        if not self.autocommit:
            logging.warning('Would have applied the following update: '
                            '%s -> %s', old_update, update)
            return

        self.history[old_issue.id] = old_update
        self.reporter.modify_bug_report(
                issue_id=old_issue.id, comment=update.comment,
                label_update=update.labels,
                status=update.status)
        self.present[old_issue.id] = update
class Crawler(object):
    """Class capable of crawling crbug.

    This class applies filters to issues it crawls and caches them locally
    in `self.issues`.
    """

    # The limit at which we ask for confirmation to proceed with the crawl.
    PROMPT_LIMIT = 2000

    def __init__(self):
        self.reporter = reporting.Reporter()
        self.phapi_client = self.reporter.get_bug_tracker_client()
        self.issues = None
        self.all_autofiled_query = 'ANCHOR TestFailure'
        self.all_autofiled_label = 'autofiled'
        self.prompted = False

    def fuzzy_search(self, query='', label='', fast=True):
        """Returns all issues using one query and/or one label.

        @param query: A string representing the query.
        @param label: A string representing the label.
        @param fast: If true, don't bother fetching comments.

        @return: A list of issues matching the query. If fast is
            specified the issues won't have comments.

        @raises ValueError: If neither a query nor a label is given, or if
            the label is not a string.
        """
        if not query and not label:
            # Suggest the query matching all autofiled bugs as an example.
            raise ValueError(
                    'Require query or labels to make a tracker query, '
                    'try query = "%s" or one of the predefined labels %s' %
                    (self.all_autofiled_query,
                     self.reporter._PREDEFINED_LABELS))
        if not isinstance(label, str):
            # Wrap the label in a tuple so a tuple label formats correctly.
            raise ValueError(
                    'The crawler only supports one label per query, '
                    'and it must be a string. you supplied %s' % (label,))
        return self.phapi_client.get_tracker_issues_by_text(
                query, label=label, full_text=not fast)

    @staticmethod
    def _get_autofiled_count(issue):
        """Return the autofiled count.

        @param issue: An issue object that has labels.

        @return: An integer representing the autofiled count, or 0 if the
            issue has no parsable autofiled-count label.
        """
        for label in issue.labels:
            if 'autofiled-count-' in label:
                try:
                    return int(label.replace('autofiled-count-', ''))
                except ValueError:
                    # A malformed count label must not abort the whole
                    # listing; keep looking at the remaining labels.
                    continue
        # Force bugs without autofiled-count to sink
        return 0

    def _prompt_crawl(self, new_issues, start_index):
        """Warn the user that a crawl is getting large.

        This method prompts for a y/n answer in case the user wants to abort
        the crawl and specify another set of labels/queries. Any answer
        other than 'y' exits the process. Once confirmed, the user is not
        prompted again.

        @param new_issues: A list of issues used with the start_index to
            determine the number of issues already processed.
        @param start_index: The start index the new issues were fetched from.
        """
        logging.warning('Found %s issues, Crawling issues starting from %s',
                        len(new_issues), start_index)
        if start_index > self.PROMPT_LIMIT and not self.prompted:
            logging.warning(
                    'Already crawled %s issues, it is possible that '
                    'you\'ve specified a very general label. If this is the '
                    'case consider re-ordering the labels so they start with '
                    'the rarest. Continue crawling [y/n]?',
                    start_index + len(new_issues))
            self.prompted = raw_input() == 'y'
            if not self.prompted:
                sys.exit(0)

    def exhaustive_crawl(self, query='', label='', fast=True):
        """Perform an exhaustive crawl using one label and query string.

        Pages through the tracker results until an empty page comes back.

        @param query: A string representing one query.
        @param label: A string representing one label.
        @param fast: If true, don't bother fetching comments.

        @return: A list of issues sorted by descending autofiled count.
        """
        # NOTE(review): start indices are assumed to be 1-based, matching
        # the reset to 1 below. The next page therefore starts exactly
        # len(page) after the current one; adding an extra 1 per page would
        # skip one issue on every page after the first.
        start_index = 1
        self.phapi_client.set_max_results(200)
        logging.warning('Performing an exhaustive crawl with label %s query %s',
                        label, query)
        vague_issues = []
        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
        while new_issues:
            vague_issues.extend(new_issues)
            start_index += len(new_issues)
            self.phapi_client.set_start_index(start_index)
            new_issues = self.fuzzy_search(query=query, label=label,
                                           fast=fast)
            self._prompt_crawl(new_issues, start_index)

        # Subsequent calls will clear the issues cache with new results.
        self.phapi_client.set_start_index(1)
        return sorted(vague_issues, reverse=True,
                      key=self._get_autofiled_count)

    @staticmethod
    def filter_labels(issues, labels):
        """Takes a list of labels and returns matching issues.

        @param issues: A list of issues to parse for labels.
        @param labels: A list of labels to match.

        @return: A list of matching issues, in the order they were passed
            in. The issues must contain all the labels specified. If no
            labels are given the issues are returned untouched.
        """
        if not labels:
            return issues
        required = set(labels)
        # Build a list rather than a set so the ordering of the crawl (by
        # autofiled count) survives and issues need not be hashable.
        return [issue for issue in issues if required.issubset(issue.labels)]

    @classmethod
    def does_query_match(cls, issue, query):
        """Check if a query matches the given issue.

        @param issue: The issue to check.
        @param query: The query to check against.

        @return: True if the query matches, false otherwise.
        """
        if query in issue.title or query in issue.summary:
            return True
        # We can only search comments if the issue is a complete issue
        # i.e as defined in phapi_lib.Issue.
        try:
            if any(query in comment for comment in issue.comments):
                return True
        except (AttributeError, TypeError):
            pass
        return False

    @classmethod
    def filter_queries(cls, issues, queries):
        """Take a list of queries and returns matching issues.

        @param issues: A list of issues to parse. If the issues contain
            comments and a query is not in the issues title or summary,
            the comments are parsed for a substring match.
        @param queries: A list of queries to parse the issues for.
            This method looks for an exact substring match within each issue.

        @return: A list of the issues matching every query, in the order
            they were passed in. If no queries are given the issues are
            returned untouched.
        """
        if not queries:
            return issues
        matching_issues = []
        for issue in issues:
            # For each query, check if it's in the title, description or
            # comments. If a query isn't in any of these, discard the issue.
            for position, query in enumerate(queries):
                if cls.does_query_match(issue, query):
                    continue
                if position:
                    # At least one earlier query matched this issue.
                    logging.warning('%s: %s\n \tPassed a subset of the '
                                    'queries but failed query %s',
                                    issue.id, issue.title, query)
                break
            else:
                matching_issues.append(issue)
        return matching_issues

    def filter_issues(self, queries='', labels=None, fast=True):
        """Run the queries, labels filters by crawling crbug.

        The first query and first label drive the crawl itself; the
        remaining ones are applied locally to the crawled issues. The result
        is stored in self.issues.

        @param queries: A space separated string of queries, usually passed
            through the command line.
        @param labels: A space separated string of labels, usually passed
            through the command line.
        @param fast: If specified, skip creating comments for issues since
            this can be a slow process. This value is only a suggestion,
            since it is ignored if multiple queries are specified.
        """
        queries = shlex.split(queries)
        labels = shlex.split(labels) if labels else None

        # We'll need comments to filter multiple queries.
        if len(queries) > 1:
            fast = False
        matching_issues = self.exhaustive_crawl(
                query=queries.pop(0) if queries else '',
                label=labels.pop(0) if labels else '', fast=fast)
        matching_issues = self.filter_labels(matching_issues, labels)
        matching_issues = self.filter_queries(matching_issues, queries)
        self.issues = list(matching_issues)

    def dump_issues(self, limit=None):
        """Print issues.

        @param limit: The maximum number of issues to print. A falsy limit
            prints all the issues.
        """
        # Nothing has been crawled yet if self.issues is still None.
        issues = self.issues or []
        if limit and limit < len(issues):
            issues = issues[:limit]
        #TODO: Modify formatting, include some paging etc.
        for issue in issues:
            try:
                print('[%s] %s crbug.com/%s %s' %
                      (self._get_autofiled_count(issue),
                       issue.status, issue.id, issue.title))
            except UnicodeEncodeError:
                print('Unicode error encoding issue id %s' % issue.id)
def _update_test(args):
    """A simple update test, to record usage.

    Applies a bogus update to each issue and then reverts the status
    changes. The manager is created with autocommit=True, so the updates are
    committed through the reporter rather than just logged.

    @param args: An iterable of the issues to update, each either an issue
        id or an issue object accepted by UpdateManager.update.
        NOTE(review): the original body iterated an undefined name `issues`
        and never used `args`; treating `args` as the issues is an
        assumption to confirm.
    """
    updater = UpdateManager(autocommit=True)
    for issue in args:
        updater.update(issue,
                       Update(comment='this is bogus', labels=['bogus'],
                              status='Assigned'))
    updater.revert()
def configure_logging(quiet=False):
    """Set up the root logger and pick its verbosity.

    @param quiet: True to turn off warning messages, so that only errors
        are let through. Warnings are let through otherwise.
    """
    logging.basicConfig()
    threshold = logging.ERROR if quiet else logging.WARNING
    logging.getLogger().setLevel(threshold)
def main(args):
    """Crawl crbug as requested on the command line and print the issues.

    @param args: The parsed command line arguments; the reap, queries,
        labels and num attributes are read.
    """
    # Validate before creating the crawler, so that an unsupported
    # invocation does not set up a reporter and tracker client for nothing.
    if args.reap and (args.queries or args.labels):
        logging.error('Query based ranking of bugs not supported yet.')
        return

    crawler = Crawler()
    if args.reap:
        queries = ''
        labels = crawler.all_autofiled_label
    else:
        queries = args.queries
        labels = args.labels
    # Text queries need the comments of the issues, so they can't be fast.
    crawler.filter_issues(queries=queries, labels=labels, fast=not queries)

    limit = int(args.num)
    crawler.dump_issues(limit)
    # Only mention truncation when dump_issues actually dropped results; it
    # prints everything when the limit is falsy or not below the total.
    if limit and limit < len(crawler.issues):
        logging.warning('\nThis is a truncated list of %s results, use --num %s '
                        'to get them all. If you want more informative results/better '
                        'querying capabilities try crbug_shell.py.',
                        args.num, len(crawler.issues))
if __name__ == '__main__':
    # Parse the command line first: the --quiet flag decides how logging is
    # configured before the crawl starts.
    parsed_args = _parse_args(sys.argv[1:])
    configure_logging(quiet=parsed_args.quiet)
    main(parsed_args)