#!/usr/bin/python

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script crawls crbug. Sort-of.
Invocation:
    Get all bugs with labels, strings (in summary and/or comments):
        crbug_crawler.py --labels 'one two three'
                         --queries '"first query" "second query"'

    Get baddest open bugs of all time:
        crbug_crawler.py --reap

Tips:
    - Label based queries will return faster than text queries.
    - contrib/crbug_shell.py is a wrapper that allows you to incrementally
      filter search results using this script.
"""

import argparse
import cmd
import logging
import sys
import shlex

import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.server.cros.dynamic_suite import reporting


def _parse_args(args):
    if not args:
        import crbug_crawler
        logging.error('Improper usage of crbug_crawler: %s',
                      crbug_crawler.__doc__)
        sys.exit(1)

    description = ('Usage: crbug_crawler.py --reap')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--quiet', help=('Turn off logging noise.'),
                        action='store_true', default=False)
    parser.add_argument('--num', help='Number of issues to output.', default=10,
                        type=int)
    parser.add_argument('--queries',
                        help=('Search query. Eg: --queries "%s %s"' %
                              ('build_Root', 'login')),
                        default='')
    parser.add_argument('--labels',
                        help=('Search labels. Eg: --labels "%s %s"' %
                              ('autofiled', 'Pri-1')), default=None)
    parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
                        action='store_true', default=False)
    return parser.parse_args(args)
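
# Illustrative example of the resulting argparse namespace; the flag values
# below are made up and only reflect the defaults defined above.
#   opts = _parse_args(['--labels', 'autofiled Pri-1', '--num', '20'])
#   # opts.labels == 'autofiled Pri-1', opts.num == 20, opts.reap == False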


class Update(object):
    """Class encapsulating fields of an update to a bug.
    """
    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
                     'Started', 'ExternalDependency']
    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']

    def __init__(self, comment='', labels='', status=''):
        self.comment = comment
        self.labels = labels if labels else []
        self.status = status


    def __str__(self):
        msg = 'status: %s' % self.status
        if self.labels:
            msg = '%s labels: %s' % (msg, self.labels)
        if self.comment:
            msg = '%s comment: %s' % (msg, self.comment)
        return msg
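
    # Usage sketch (illustrative only; the comment and label strings below are
    # made up, not values this script uses):
    #   update = Update(comment='Fixed on ToT', labels=['Verified-on-ToT'],
    #                   status='Fixed')
    #   str(update) ->
    #       "status: Fixed labels: ['Verified-on-ToT'] comment: Fixed on ToT"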


class UpdateManager(object):
    """Update manager that allows you to revert status updates.

    This class keeps track of the last update applied and is capable
    of reverting it.
    """

    def __init__(self, autocommit=False):
        """Initialize update manager.

        @param autocommit: If False, just log the update that would have
            been applied instead of committing it.
        """
        self.history = {}
        self.present = {}
        self.reporter = reporting.Reporter()
        self.phapi_lib = self.reporter.get_bug_tracker_client()
        self.autocommit = autocommit


    def revert(self):
        """Revert the updates applied so far.

        Only statuses are reverted automatically; labels and comments have to
        be undone by hand (a warning is logged for each).
        """
        for issue_id, update in self.history.iteritems():
            logging.warning('You will have to manually update %s and %s on %s',
                            self.present[issue_id].labels,
                            self.present[issue_id].comment, issue_id)
            # Create a new update with just the status.
            self.update(issue_id, Update(status=update.status))


    def update(self, old_issue, update):
        """Record the state of an issue before updating it.

        @param old_issue: The issue to update. If an id is specified an
            issue is constructed. If an issue object (as defined in phapi_lib
            Issue) is passed in, it is used directly.
        @param update: The Update object to apply to the issue.
        """
        if type(old_issue) == int:
            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
        old_update = Update(
                labels=old_issue.labels, status=old_issue.status)

        if not update.status:
            update.status = old_update.status
        elif (update.status not in Update.open_statuses and
              update.status not in Update.closed_statuses):
            raise ValueError('Unknown status %s' % update.status)

        if not self.autocommit:
            logging.warning('Would have applied the following update: '
                            '%s -> %s', old_update, update)
            return

        self.history[old_issue.id] = old_update
        self.reporter.modify_bug_report(
                issue_id=old_issue.id, comment=update.comment,
                label_update=update.labels,
                status=update.status)
        self.present[old_issue.id] = update
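
    # Minimal usage sketch. Assumes tracker credentials are already configured
    # for reporting.Reporter(); the issue id below is made up.
    #   manager = UpdateManager(autocommit=False)        # dry run, only logs
    #   manager.update(123456, Update(status='Assigned'))
    #   manager.revert()   # re-applies recorded statuses (no-op for dry runs)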


class Crawler(object):
    """Class capable of crawling crbug.

    This class applies filters to issues it crawls and caches them locally.
    """

    # The limit at which we ask for confirmation to proceed with the crawl.
    PROMPT_LIMIT = 2000

    def __init__(self):
        self.reporter = reporting.Reporter()
        self.phapi_client = self.reporter.get_bug_tracker_client()
        self.issues = None
        self.all_autofiled_query = 'ANCHOR TestFailure'
        self.all_autofiled_label = 'autofiled'
        self.prompted = False


    def fuzzy_search(self, query='', label='', fast=True):
        """Returns all issues using one query and/or one label.

        @param query: A string representing the query.
        @param label: A string representing the label.
        @param fast: If True, don't bother fetching comments.

        @return: A list of issues matching the query. If fast is
            specified the issues won't have comments.
        """
        if not query and not label:
            raise ValueError('Need a query or a label to make a tracker query; '
                    'try query = "%s" or one of the predefined labels %s' %
                    (self.all_autofiled_query,
                     self.reporter._PREDEFINED_LABELS))
        if type(label) != str:
            raise ValueError('The crawler only supports one label per query, '
                    'and it must be a string. You supplied %s' % label)
        return self.phapi_client.get_tracker_issues_by_text(
                query, label=label, full_text=not fast)
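
    # Example (sketch; 'crawler' stands for a Crawler instance):
    #   issues = crawler.fuzzy_search(label='autofiled', fast=True)
    #   # fast=True returns issues without their comments, which is quicker.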


    @staticmethod
    def _get_autofiled_count(issue):
        """Return the autofiled count.

        @param issue: An issue object that has labels.

        @return: An integer representing the autofiled count.
        """
        for label in issue.labels:
            if 'autofiled-count-' in label:
                return int(label.replace('autofiled-count-', ''))

        # Force bugs without an autofiled-count label to sink to the bottom
        # of the sorted list.
        return 0
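
    # For example, an issue labeled ['autofiled', 'autofiled-count-7'] yields 7,
    # while an issue without an autofiled-count-* label yields 0.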


    def _prompt_crawl(self, new_issues, start_index):
        """Warn the user that a crawl is getting large.

        This method prompts for a y/n answer in case the user wants to abort
        the crawl and specify another set of labels/queries.

        @param new_issues: A list of issues used with the start_index to
            determine the number of issues already processed.
        @param start_index: The start index of the next crawl iteration.
        """
        logging.warning('Found %s issues; crawling issues starting from %s',
                        len(new_issues), start_index)
        if start_index > self.PROMPT_LIMIT and not self.prompted:
            logging.warning('Already crawled %s issues; it is possible that '
                    'you\'ve specified a very general label. If this is the '
                    'case consider re-ordering the labels so they start with '
                    'the rarest. Continue crawling [y/n]?',
                    start_index + len(new_issues))
            self.prompted = raw_input() == 'y'
            if not self.prompted:
                sys.exit(0)


    def exhaustive_crawl(self, query='', label='', fast=True):
        """Perform an exhaustive crawl using one label and query string.

        @param query: A string representing one query.
        @param label: A string representing one label.
        @param fast: If True, don't fetch comments.

        @return A list of issues sorted by descending autofiled count.
        """
        start_index = 0
        self.phapi_client.set_max_results(200)
        logging.warning('Performing an exhaustive crawl with label %s query %s',
                        label, query)
        vague_issues = []
        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
        while new_issues:
            vague_issues += new_issues
            start_index += len(new_issues) + 1
            self.phapi_client.set_start_index(start_index)
            new_issues = self.fuzzy_search(query=query, label=label,
                                           fast=fast)
            self._prompt_crawl(new_issues, start_index)

        # Subsequent calls will clear the issues cache with new results.
        self.phapi_client.set_start_index(1)
        return sorted(vague_issues, reverse=True,
                      key=lambda issue: self._get_autofiled_count(issue))
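
    # Example (sketch; 'crawler' stands for a Crawler instance):
    #   ranked = crawler.exhaustive_crawl(label='autofiled')
    #   # 'ranked' starts with the issues carrying the highest autofiled-count.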


    @staticmethod
    def filter_labels(issues, labels):
        """Takes a list of labels and returns matching issues.

        @param issues: A list of issues to parse for labels.
        @param labels: A list of labels to match.

        @return: A list of matching issues. The issues must contain
            all the labels specified.
        """
        if not labels:
            return issues
        matching_issues = set([])
        labels = set(labels)
        for issue in issues:
            issue_labels = set(issue.labels)
            if issue_labels.issuperset(labels):
                matching_issues.add(issue)
        return matching_issues
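
    # For example, filter_labels(issues, ['autofiled', 'Pri-1']) keeps only the
    # issues that carry both labels.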


    @classmethod
    def does_query_match(cls, issue, query):
        """Check if a query matches the given issue.

        @param issue: The issue to check.
        @param query: The query to check against.

        @return: True if the query matches, False otherwise.
        """
        if query in issue.title or query in issue.summary:
            return True
        # We can only search comments if the issue is a complete issue,
        # i.e. as defined in phapi_lib.Issue.
        try:
            if any(query in comment for comment in issue.comments):
                return True
        except (AttributeError, TypeError):
            pass
        return False


    @classmethod
    def filter_queries(cls, issues, queries):
        """Take a list of queries and return matching issues.

        @param issues: A list of issues to parse. If the issues contain
            comments and a query is not in the issue's title or summary,
            the comments are parsed for a substring match.
        @param queries: A list of queries to parse the issues for.
            This method looks for an exact substring match within each issue.

        @return: A list of matching issues.
        """
        if not queries:
            return issues
        matching_issues = set([])
        for issue in issues:
            # For each query, check if it's in the title, summary or
            # comments. If a query isn't in any of these, discard the issue.
            for query in queries:
                if cls.does_query_match(issue, query):
                    matching_issues.add(issue)
                else:
                    if issue in matching_issues:
                        logging.warning('%s: %s\n \tPassed a subset of the '
                                        'queries but failed query %s',
                                        issue.id, issue.title, query)
                        matching_issues.remove(issue)
                    break
        return matching_issues
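
    # For example, filter_queries(issues, ['login', 'dbus']) keeps an issue
    # only if every one of the strings appears in its title, summary or
    # comments.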


    def filter_issues(self, queries='', labels=None, fast=True):
        """Run the queries and labels filters by crawling crbug.

        @param queries: A space separated string of queries, usually passed
            through the command line.
        @param labels: A space separated string of labels, usually passed
            through the command line.
        @param fast: If True, skip fetching comments for issues since this
            can be a slow process. This value is only a suggestion, since it
            is ignored if multiple queries are specified.
        """
        queries = shlex.split(queries)
        labels = shlex.split(labels) if labels else None

        # We'll need comments to filter multiple queries.
        if len(queries) > 1:
            fast = False
        matching_issues = self.exhaustive_crawl(
                query=queries.pop(0) if queries else '',
                label=labels.pop(0) if labels else '', fast=fast)
        matching_issues = self.filter_labels(matching_issues, labels)
        matching_issues = self.filter_queries(matching_issues, queries)
        self.issues = list(matching_issues)
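
    # Example (sketch; 'crawler' stands for a Crawler instance). Quoted phrases
    # survive shlex.split as single queries:
    #   crawler.filter_issues(queries='"login timeout" dbus',
    #                         labels='autofiled Pri-1', fast=True)
    #   # fast is ignored here because two queries force a comments fetch.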


    def dump_issues(self, limit=None):
        """Print issues.
        """
        if limit and limit < len(self.issues):
            issues = self.issues[:limit]
        else:
            issues = self.issues
        # TODO: Modify formatting, include some paging etc.
        for issue in issues:
            try:
                print ('[%s] %s crbug.com/%s %s' %
                       (self._get_autofiled_count(issue),
                        issue.status, issue.id, issue.title))
            except UnicodeEncodeError:
                print 'Unicode error encoding issue id %s' % issue.id
                continue


def _update_test(issues):
    """A simple update test that applies a bogus update, then reverts it.

    @param issues: A list of issues (or issue ids) to update.
    """
    updater = UpdateManager(autocommit=True)
    for issue in issues:
        updater.update(issue,
                       Update(comment='this is bogus', labels=['bogus'],
                              status='Assigned'))
    updater.revert()


def configure_logging(quiet=False):
    """Configure logging.

    @param quiet: True to turn off warning messages.
    """
    logging.basicConfig()
    logger = logging.getLogger()
    level = logging.WARNING
    if quiet:
        level = logging.ERROR
    logger.setLevel(level)


def main(args):
    crawler = Crawler()
    if args.reap:
        if args.queries or args.labels:
            logging.error('Query based ranking of bugs not supported yet.')
            return
        queries = ''
        labels = crawler.all_autofiled_label
    else:
        queries = args.queries
        labels = args.labels
    crawler.filter_issues(queries=queries, labels=labels,
                          fast=False if queries else True)
    crawler.dump_issues(int(args.num))
    logging.warning('\nThis is a truncated list of %s results, use --num %s '
                    'to get them all. If you want more informative results/better '
                    'querying capabilities try crbug_shell.py.',
                    args.num, len(crawler.issues))


if __name__ == '__main__':
    args = _parse_args(sys.argv[1:])
    configure_logging(args.quiet)
    main(args)