| #!/usr/bin/python2 |
| |
| # Copyright 2014 Google Inc. |
| # |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Skia's Chromium Codereview Comparison Script. |
| |
| This script takes two Codereview URLs, looks at the trybot results for |
| the two codereviews and compares the results. |
| |
| Usage: |
| compare_codereview.py CONTROL_URL ROLL_URL |
| """ |
| |
| import collections |
| import os |
| import re |
| import sys |
| import urllib2 |
| import HTMLParser |
| |
| |
class CodeReviewHTMLParser(HTMLParser.HTMLParser):
    """Parses a CodeReview web page.

    Use the CodeReviewHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  [Search for "You can't parse [X]HTML with regex." for more
    information.]

    Once a <div id="tryjobdiv*"> has been seen, every
    <a class="build-result" status="..." href="..."> anchor is recorded
    in self.statuses, keyed by the anchor's (stripped) text.
    """
    # pylint: disable=I0011,R0904

    # namedtuples are like lightweight structs in Python.  The low
    # overhead of a tuple, but the ease of use of an object.
    Status = collections.namedtuple('Status', ['status', 'url'])

    @staticmethod
    def parse(url):
        """Parses a CodeReview web page.

        Args:
            url (string), a codereview URL like this:
                'https://codereview.chromium.org/?????????'.

        Returns:
            A dictionary; the keys are bot_name strings, the values
            are CodeReviewHTMLParser.Status objects.  Returns None
            (after writing a message to stderr) if the page could not
            be fetched.
        """
        try:
            response = urllib2.urlopen(url)
            try:
                page = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except urllib2.URLError:
            sys.stderr.write('Error getting %s\n' % url)
            return None
        parser = CodeReviewHTMLParser()
        parser.feed(page)
        parser.close()
        return parser.statuses

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # id of the most recent <div id="tryjobdiv*">; never cleared,
        # so it only tells us that such a div has been seen.
        self._id = None
        # status/href attributes of the anchor currently being read.
        self._status = None
        self._href = None
        # Text accumulated inside the anchor currently being read.
        self._anchor_data = ''
        # True between a matching <a ...> and its </a>.
        self._currently_parsing_trybotdiv = False
        # statuses is a dictionary of CodeReviewHTMLParser.Status
        self.statuses = {}

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to implement functionality.

        [[begin standard library documentation]]
        This method is called to handle the start of a tag
        (e.g. <div id="main">).

        The tag argument is the name of the tag converted to lower
        case. The attrs argument is a list of (name, value) pairs
        containing the attributes found inside the tag's <>
        brackets. The name will be translated to lower case, and
        quotes in the value have been removed, and character and
        entity references have been replaced.

        For instance, for the tag <A HREF="http://www.cwi.nl/">, this
        method would be called as handle_starttag('a', [('href',
        'http://www.cwi.nl/')]).
        [[end standard library documentation]]
        """
        attrs = dict(attrs)
        if tag == 'div':
            # We are looking for <div id="tryjobdiv*">.  A valueless
            # attribute is reported as None, hence the "or ''".
            id_attr = attrs.get('id') or ''
            if id_attr.startswith('tryjobdiv'):
                self._id = id_attr
            return
        if tag != 'a' or not self._id:
            return
        if 'build-result' in (attrs.get('class') or '').split():
            # We have already seen a <div id="tryjobdiv*">, and this
            # is a link of the form
            # <a class="build-result" href="*">.  Save the
            # (non-standard) status attribute and the URL, and start
            # saving anchor data from scratch.
            self._status = attrs.get('status')
            self._href = attrs.get('href')
            self._anchor_data = ''
            self._currently_parsing_trybotdiv = True

    def handle_data(self, data):
        """Overrides the HTMLParser method to implement functionality.

        [[begin standard library documentation]]
        This method is called to process arbitrary data (e.g. text
        nodes and the content of <script>...</script> and
        <style>...</style>).
        [[end standard library documentation]]
        """
        # Save the text inside the <a></a> tags.  Assume <a> tags
        # aren't nested.
        if self._currently_parsing_trybotdiv:
            self._anchor_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to implement functionality.

        [[begin standard library documentation]]
        This method is called to handle the end tag of an element
        (e.g. </div>).  The tag argument is the name of the tag
        converted to lower case.
        [[end standard library documentation]]
        """
        if tag != 'a' or not self._currently_parsing_trybotdiv:
            return
        # We take the accumulated self._anchor_data and save it as
        # the bot name.  Anchors without a status or without any text
        # are not recorded.
        bot = self._anchor_data.strip()
        if bot and self._status:
            # Add to accumulating dictionary.
            self.statuses[bot] = CodeReviewHTMLParser.Status(
                status=self._status, url=self._href)
        # Reset state to search for the next bot.  This happens for
        # every tracked anchor (even one lacking a status attribute)
        # so its text cannot leak into the next bot's name.
        self._currently_parsing_trybotdiv = False
        self._anchor_data = ''
        self._status = None
        self._href = None
| |
| |
class BuilderHTMLParser(HTMLParser.HTMLParser):
    """Parses Trybot web pages.

    Use the BuilderHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  [Search for "You can't parse [X]HTML with regex." for more
    information.]

    For every outermost <li> that contains a
    <div class="failure result">, one Result is appended to
    self.failure_results.  Its text is everything seen from that div
    to the closing outermost </li> (lightly cleaned up), and its url is
    the last href ending in '/logs/stdio' seen in that span, or ''.
    """
    # pylint: disable=I0011,R0904

    Result = collections.namedtuple('Result', ['text', 'url'])

    @staticmethod
    def parse(url):
        """Parses a Trybot web page.

        Args:
            url (string), a trybot result URL.

        Returns:
            An array of BuilderHTMLParser.Results, each a description
            of failure results, along with an optional url.  Returns
            an empty array (after writing a message to stderr) if the
            page could not be fetched.
        """
        try:
            response = urllib2.urlopen(url)
            try:
                page = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except urllib2.URLError:
            sys.stderr.write('Error getting %s\n' % url)
            return []
        parser = BuilderHTMLParser()
        parser.feed(page)
        parser.close()
        return parser.failure_results

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.failure_results = []
        # The next two attributes are never read by this class; they
        # are kept so the instance layout is unchanged.
        self._current_failure_result = None
        self._divlevel = None
        # Current <li> nesting depth.
        self._li_level = 0
        # Text accumulated since the failure div was opened.
        self._li_data = ''
        # True once a failure div has been seen in the current <li>.
        self._current_failure = False
        self._failure_results_url = ''

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to implement functionality.

        [[begin standard library documentation]]
        This method is called to handle the start of a tag
        (e.g. <div id="main">).

        The tag argument is the name of the tag converted to lower
        case. The attrs argument is a list of (name, value) pairs
        containing the attributes found inside the tag's <>
        brackets. The name will be translated to lower case, and
        quotes in the value have been removed, and character and
        entity references have been replaced.

        For instance, for the tag <A HREF="http://www.cwi.nl/">, this
        method would be called as handle_starttag('a', [('href',
        'http://www.cwi.nl/')]).
        [[end standard library documentation]]
        """
        attrs = dict(attrs)
        if tag == 'li':
            # <li> tags can be nested.  So we have to count the
            # nest-level for backing out.
            self._li_level += 1
            return
        if tag == 'div' and attrs.get('class') == 'failure result':
            # We care about this sort of thing:
            #   <li>
            #     <li>
            #       <li>
            #         <div class="failure result">...</div>
            #       </li>
            #     </li>
            #     We want this text here.
            #   </li>
            if self._li_level > 0:
                self._current_failure = True  # Tells us to keep text.
            return

        if tag == 'a' and self._current_failure:
            # Anchors without an href (or with a valueless one) are
            # reported as missing/None; treat them as an empty string.
            href = attrs.get('href') or ''
            # Sometimes we want to keep the stdio url.  We always
            # return it, just in case.
            if href.endswith('/logs/stdio'):
                self._failure_results_url = href

    def handle_data(self, data):
        """Overrides the HTMLParser method to implement functionality.

        [[begin standard library documentation]]
        This method is called to process arbitrary data (e.g. text
        nodes and the content of <script>...</script> and
        <style>...</style>).
        [[end standard library documentation]]
        """
        if self._current_failure:
            self._li_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to implement functionality.

        [[begin standard library documentation]]
        This method is called to handle the end tag of an element
        (e.g. </div>).  The tag argument is the name of the tag
        converted to lower case.
        [[end standard library documentation]]
        """
        if tag != 'li' or self._li_level <= 0:
            # A stray </li> must not push the depth below zero,
            # otherwise every later failure div would be ignored.
            return
        self._li_level -= 1
        if self._li_level != 0:
            return
        if self._current_failure:
            text = self._li_data.strip()
            words = text.split()
            if words:
                # Sometimes, it repeats the same thing multiple
                # times; collapse a leading run of the first word.
                # The word is escaped so regex metacharacters in it
                # are matched literally, and (?!\S) makes sure only
                # whole words count as repeats.
                first = words[0]
                token = re.escape(first)
                repeated = re.match(
                    r'%s(?:\s+%s)+(?!\S)' % (token, token), text)
                if repeated:
                    text = first + text[repeated.end():]
            text = re.sub(r'unexpected flaky.*', '', text)
            # Remove some extra unnecessary text.
            text = re.sub(r'\bpreamble\b', '', text)
            text = re.sub(r'\bstdio\b', '', text)
            self.failure_results.append(
                BuilderHTMLParser.Result(text, self._failure_results_url))
        # Reset the state.
        self._current_failure = False
        self._li_data = ''
        self._failure_results_url = ''
| |
| |
def printer(indent, string):
    """Write `string` to sys.stdout, indented and word-wrapped.

    Args:
        indent: (int) indentation level; each level is one copy of
            the spacer string, and the wrap width shrinks by two
            columns per level (68 - 2 * indent).
        string: (string) text to print.  It is split on newlines and
            each piece is wrapped separately; pieces containing no
            words produce no output.  Continuation lines of a wrapped
            piece get one extra spacer.
    """
    def wrap_to(line, columns):
        """Greedily pack the words of `line` into rows of at most
        `columns` characters; returns the list of rows.  A single
        word longer than `columns` gets a row to itself.
        """
        rows = []
        pending = ''
        for token in line.split():
            candidate = pending + ' ' + token if pending else token
            if pending and len(candidate) > columns:
                # The row is full: emit it and start a new one.
                rows.append(pending)
                pending = token
            else:
                pending = candidate
        if pending:
            rows.append(pending)
        return rows

    stream = sys.stdout
    pad = ' '
    width = 68 - (2 * indent)
    base_prefix = pad * indent
    for piece in string.split('\n'):
        for index, row in enumerate(wrap_to(piece, width)):
            prefix = base_prefix if index == 0 else base_prefix + pad
            stream.write(prefix + row + '\n')
    stream.flush()
| |
| |
def main(control_url, roll_url, verbosity=1):
    """Compare two Codereview URLs

    Fetches the trybot statuses of both codereviews and prints, for
    the bots they have in common, a summary table (verbosity > 0)
    followed by per-bot details.  Bots whose roll status is 'success'
    are only listed in the details when verbosity > 1.  For a status
    of 'failure' the trybot page is fetched and its failure
    descriptions are printed.

    Args:
        control_url, roll_url: (strings) URL of the format
            https://codereview.chromium.org/?????????

        verbosity: (int) verbose level.  0, 1, or 2.

    Returns:
        None.  If either codereview cannot be fetched, or the two have
        no trybots in common, a message is written to stderr and
        nothing else is printed.
    """
    # pylint: disable=I0011,R0914,R0912
    control = CodeReviewHTMLParser.parse(control_url)
    roll = CodeReviewHTMLParser.parse(roll_url)
    if control is None or roll is None:
        # parse() returns None (and has already reported the URL) when
        # a page cannot be fetched; set(None) below would raise.
        sys.stderr.write(
            'Error: could not load trybot results for both codereviews.\n')
        return
    all_bots = set(control) & set(roll)  # Set intersection.
    if not all_bots:
        sys.stderr.write(
            'Error: control %s and roll %s have no common trybots.\n'
            % (list(control), list(roll)))
        return

    control_name = '[control %s]' % control_url.split('/')[-1]
    roll_name = '[roll %s]' % roll_url.split('/')[-1]
    bots = sorted(all_bots)

    out = sys.stdout
    if verbosity > 0:
        # Print out summary of all of the bots.
        out.write('%11s %11s %4s %s\n\n' %
                  ('CONTROL', 'ROLL', 'DIFF', 'BOT'))
        for bot in bots:
            control_status = control[bot].status
            roll_status = roll[bot].status
            if control_status != roll_status:
                diff = '****'
            elif control_status != 'success':
                # Same status on both sides, but not a success.
                diff = '....'
            else:
                diff = ''
            out.write('%11s %11s %4s %s\n' % (
                control_status, roll_status, diff, bot))
        out.write('\n')
        out.flush()

    for bot in bots:
        if roll[bot].status == 'success':
            if verbosity > 1:
                printer(0, '==%s==' % bot)
                printer(1, 'OK')
            continue
        printer(0, '==%s==' % bot)

        for (status, name, url) in (
                (control[bot].status, control_name, control[bot].url),
                (roll[bot].status, roll_name, roll[bot].url)):
            printer(1, name)
            if status != 'failure':
                printer(2, status)
                continue
            # A build-result link without an href has no page to
            # fetch; report the failure with no details.
            results = BuilderHTMLParser.parse(url) if url else []
            for result in results:
                # Put each *.html name on its own line, prefixed
                # with '__'.
                formatted_result = re.sub(
                    r'(\S*\.html) ', '\n__\\g<1>\n', result.text)
                printer(2, formatted_result)
                if ('compile' in result.text
                        or '...and more' in result.text):
                    # result.url is appended to the build URL with
                    # its last path component removed.
                    printer(3, re.sub('/[^/]*$', '/', url) + result.url)
        out.write('\n')
| |
| |
if __name__ == '__main__':
    # Script entry point: compare_codereview.py CONTROL_URL ROLL_URL.
    # The verbosity level is read from the COMPARE_CODEREVIEW_VERBOSITY
    # environment variable and defaults to 1.
    if len(sys.argv) < 3:
        sys.stderr.write(__doc__ + '\n')
        # sys.exit rather than the exit() helper, which is installed
        # by the site module and is not guaranteed to exist.
        sys.exit(1)
    try:
        VERBOSITY = int(os.environ.get('COMPARE_CODEREVIEW_VERBOSITY', 1))
    except ValueError:
        sys.stderr.write(
            'Error: COMPARE_CODEREVIEW_VERBOSITY must be an integer.\n')
        sys.exit(1)
    main(sys.argv[1], sys.argv[2], VERBOSITY)
| |