commit-bot@chromium.org | 517c1e2 | 2014-01-22 22:57:19 +0000 | [diff] [blame] | 1 | #!/usr/bin/python2 |
| 2 | |
| 3 | # Copyright 2014 Google Inc. |
| 4 | # |
| 5 | # Use of this source code is governed by a BSD-style license that can be |
| 6 | # found in the LICENSE file. |
| 7 | |
| 8 | """Skia's Chromium Codereview Comparison Script. |
| 9 | |
| 10 | This script takes two Codereview URLs, looks at the trybot results for |
| 11 | the two codereviews and compares the results. |
| 12 | |
| 13 | Usage: |
| 14 | compare_codereview.py CONTROL_URL ROLL_URL |
| 15 | """ |
| 16 | |
| 17 | import collections |
| 18 | import os |
| 19 | import re |
| 20 | import sys |
| 21 | import urllib2 |
| 22 | import HTMLParser |
| 23 | |
| 24 | |
class CodeReviewHTMLParser(HTMLParser.HTMLParser):
    """Parses a CodeReview web page and collects its trybot statuses.

    Use the CodeReviewHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  (Search for "You can't parse [X]HTML with regex." for more
    information.)

    Attributes:
        statuses: dictionary mapping a bot name (the stripped text of a
            build-result anchor) to a CodeReviewHTMLParser.Status.
    """
    # pylint: disable=I0011,R0904

    # namedtuples are like lightweight structs in Python.  The low
    # overhead of a tuple, but the ease of use of an object.
    Status = collections.namedtuple('Status', ['status', 'url'])

    @staticmethod
    def parse(url):
        """Parses a CodeReview web page.

        Args:
            url (string), a codereview URL like this:
                'https://codereview.chromium.org/?????????'.

        Returns:
            A dictionary; the keys are bot_name strings, the values
            are CodeReviewHTMLParser.Status objects.  Returns None
            (after writing a message to stderr) when a urllib2.URLError
            is raised while fetching or parsing the page.
        """
        parser = CodeReviewHTMLParser()
        try:
            response = urllib2.urlopen(url)
            try:
                parser.feed(response.read())
            finally:
                # Always release the connection, even if parsing fails.
                response.close()
        except (urllib2.URLError,):
            sys.stderr.write('Error getting %s\n' % (url,))
            return None
        parser.close()
        return parser.statuses

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # id of the last <div id="tryjobdiv*"> seen.  It is never
        # cleared, so every build-result anchor after the first such
        # div is considered.
        self._id = None
        # status/href attributes of the anchor currently being read.
        self._status = None
        self._href = None
        # Text accumulated inside the anchor currently being read.
        self._anchor_data = ''
        # True between a build-result <a> and its matching </a>.
        self._currently_parsing_trybotdiv = False
        # statuses is a dictionary of CodeReviewHTMLParser.Status
        self.statuses = {}

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every start tag.  `tag` is the
        lower-cased tag name and `attrs` is a list of (name, value)
        pairs, e.g. handle_starttag('a', [('href', 'http://x/')]).

        Remembers <div id="tryjobdiv*"> and, once such a div has been
        seen, starts capturing each <a class="build-result" ...>.
        """
        attrs = dict(attrs)
        if tag == 'div':
            # We are looking for <div id="tryjobdiv*">.  An attribute
            # written without a value is reported as None, hence `or`.
            id_attr = attrs.get('id') or ''
            if id_attr.startswith('tryjobdiv'):
                self._id = id_attr
            return
        if tag != 'a' or not self._id:
            return
        if 'build-result' not in (attrs.get('class') or '').split():
            return
        # We have already seen a <div id="tryjobdiv*"> and this is a
        # link of the form <a class="build-result" href="*">.  Save the
        # (non-standard) status attribute and the URL.
        self._status = attrs.get('status')
        self._href = attrs.get('href')
        # Start saving anchor data, discarding anything left over from
        # an anchor that was never closed.
        self._anchor_data = ''
        self._currently_parsing_trybotdiv = True

    def handle_data(self, data):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for text nodes.  Saves the text inside
        the <a></a> tags being captured.  Assumes <a> tags aren't
        nested.
        """
        if self._currently_parsing_trybotdiv:
            self._anchor_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every end tag; `tag` is the
        lower-cased tag name.  On the </a> that closes a captured
        anchor, records the bot and resets the capture state.
        """
        if tag != 'a' or not self._currently_parsing_trybotdiv:
            return
        # We take the accumulated self._anchor_data and save it as
        # the bot name.
        bot = self._anchor_data.strip()
        if bot and self._status:
            # Add to accumulating dictionary.
            self.statuses[bot] = CodeReviewHTMLParser.Status(
                status=self._status, url=self._href)
        # Reset state to search for the next bot.  This happens even
        # when the anchor had no status attribute; otherwise page text
        # would keep accumulating into the next bot's name.
        self._currently_parsing_trybotdiv = False
        self._anchor_data = ''
        self._status = None
        self._href = None
| 145 | |
| 146 | |
class BuilderHTMLParser(HTMLParser.HTMLParser):
    """Parses Trybot web pages and collects their failure results.

    Use the BuilderHTMLParser.parse static function to make use of
    this class.

    This uses the HTMLParser class because it's the best thing in
    Python's standard library.  We need a little more power than a
    regex.  (Search for "You can't parse [X]HTML with regex." for more
    information.)

    Attributes:
        failure_results: list of BuilderHTMLParser.Result, one per
            outermost <li> that contained a
            <div class="failure result">.
    """
    # pylint: disable=I0011,R0904

    Result = collections.namedtuple('Result', ['text', 'url'])

    @staticmethod
    def parse(url):
        """Parses a Trybot web page.

        Args:
            url (string), a trybot result URL.

        Returns:
            An array of BuilderHTMLParser.Results, each a description
            of failure results, along with an optional url.  Returns
            an empty list (after writing a message to stderr) when a
            urllib2.URLError is raised.
        """
        parser = BuilderHTMLParser()
        try:
            response = urllib2.urlopen(url)
            try:
                parser.feed(response.read())
            finally:
                # Always release the connection, even if parsing fails.
                response.close()
        except (urllib2.URLError,):
            sys.stderr.write('Error getting %s\n' % (url,))
            return []
        parser.close()
        return parser.failure_results

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.failure_results = []
        # The next two attributes are never read inside this class;
        # they are kept so the instance layout is unchanged.
        self._current_failure_result = None
        self._divlevel = None
        # Current <li> nesting depth.
        self._li_level = 0
        # Text gathered since the failure div of the current <li>.
        self._li_data = ''
        # True once a failure div was seen inside the current <li>.
        self._current_failure = False
        # Last href ending in '/logs/stdio' seen for this failure.
        self._failure_results_url = ''

    def handle_starttag(self, tag, attrs):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every start tag.  `tag` is the
        lower-cased tag name and `attrs` is a list of (name, value)
        pairs, e.g. handle_starttag('a', [('href', 'http://x/')]).
        """
        attrs = dict(attrs)
        if tag == 'li':
            # <li> tags can be nested.  So we have to count the
            # nest-level for backing out.
            self._li_level += 1
            return
        if tag == 'div' and attrs.get('class') == 'failure result':
            # We care about this sort of thing:
            #   <li>
            #     <li>
            #       <li>
            #         <div class="failure result">...</div>
            #       </li>
            #     </li>
            #     We want this text here.
            #   </li>
            if self._li_level > 0:
                self._current_failure = True  # Tells us to keep text.
            return

        if tag == 'a' and self._current_failure:
            # Anchors without an href (or with a value-less one) are
            # reported as None; treat them as "no URL".
            href = attrs.get('href') or ''
            # Sometimes we want to keep the stdio url.  We always
            # return it, just in case.
            if href.endswith('/logs/stdio'):
                self._failure_results_url = href

    def handle_data(self, data):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for text nodes; keeps the text while a
        failure is being collected.
        """
        if self._current_failure:
            self._li_data += data

    def handle_endtag(self, tag):
        """Overrides the HTMLParser method to implement functionality.

        Called by HTMLParser for every end tag; `tag` is the
        lower-cased tag name.  When the outermost <li> closes, emits
        a Result if a failure was seen and resets the state.
        """
        if tag != 'li':
            return
        if self._li_level <= 0:
            # Stray </li> with no open <li>: ignore it, so the depth
            # never goes negative and hides every later failure.
            return
        self._li_level -= 1
        if self._li_level != 0:
            return
        if self._current_failure:
            self.failure_results.append(BuilderHTMLParser.Result(
                self._clean_failure_text(self._li_data),
                self._failure_results_url))
            self._current_failure_result = None
        # Reset the state.
        self._current_failure = False
        self._li_data = ''
        self._failure_results_url = ''

    @staticmethod
    def _clean_failure_text(text):
        """Returns `text` stripped and with known noise removed."""
        result = text.strip()
        words = result.split()
        if words:
            # Sometimes, it repeats the same thing multiple times.
            # Collapse a leading run of the first word into one copy.
            # The word is escaped because it is arbitrary page text,
            # and a function is used as the replacement so that
            # backslashes in it are not treated as group references.
            first = words[0]
            escaped = re.escape(first)
            result = re.sub(r'^%s(\s+%s)+' % (escaped, escaped),
                            lambda _match: first, result)
        result = re.sub(r'unexpected flaky.*', '', result)
        # Remove some extra unnecessary text.
        result = re.sub(r'\bpreamble\b', '', result)
        result = re.sub(r'\bstdio\b', '', result)
        return result
| 281 | |
| 282 | |
def printer(indent, string):
    """Write `string` to sys.stdout, indented and word-wrapped.

    Args:
        indent: (int) indentation level; each output line is prefixed
            with that many spacers, and continuation lines of a
            wrapped line get one extra spacer.
        string: (string) text to print.  It is split on newlines and
            each line is wrapped to 68 - 2 * indent columns.
    """
    def wrap_to(line, columns):
        """Split `line` into words and regroup them into rows of at
        most `columns` characters; returns the list of rows.  A single
        word longer than `columns` gets a row of its own.
        """
        rows = []
        pending = []
        pending_width = 0
        for token in line.split():
            if pending and pending_width + 1 + len(token) <= columns:
                # The word still fits on the row being built.
                pending.append(token)
                pending_width += 1 + len(token)
                continue
            if pending:
                rows.append(' '.join(pending))
            pending = [token]
            pending_width = len(token)
        if pending:
            rows.append(' '.join(pending))
        return rows

    stream = sys.stdout
    pad = ' '
    width = 68 - (2 * indent)
    for paragraph in string.split('\n'):
        for index, piece in enumerate(wrap_to(paragraph, width)):
            prefix = pad * indent
            if index > 0:
                # Continuation rows are indented one step further.
                prefix += pad
            stream.write(prefix + piece + '\n')
    stream.flush()
| 314 | |
| 315 | |
def main(control_url, roll_url, verbosity=1):
    """Compare the trybot results of two Codereview URLs.

    Prints a summary table of every trybot common to both
    codereviews, followed by per-bot details for each bot whose roll
    status is not 'success'.

    Args:
        control_url, roll_url: (strings) URL of the format
            https://codereview.chromium.org/?????????

        verbosity: (int) verbose level.  0, 1, or 2.  The summary
            table is printed when verbosity > 0; successful bots are
            listed in the details when verbosity > 1.
    """
    # pylint: disable=I0011,R0914,R0912
    control = CodeReviewHTMLParser.parse(control_url)
    roll = CodeReviewHTMLParser.parse(roll_url)
    if control is None or roll is None:
        # parse() returns None (and has already named the failing URL
        # on stderr) when a page cannot be fetched; set(None) below
        # would otherwise raise a TypeError.
        sys.stderr.write(
            'Error: could not read both codereview pages; '
            'nothing to compare.\n')
        return
    all_bots = set(control) & set(roll)  # Set intersection.
    if not all_bots:
        sys.stderr.write(
            'Error: control %s and roll %s have no common trybots.\n'
            % (list(control), list(roll)))
        return

    # The issue number is the last path component; tolerate a
    # trailing slash so the name is not empty.
    control_name = '[control %s]' % control_url.rstrip('/').split('/')[-1]
    roll_name = '[roll %s]' % roll_url.rstrip('/').split('/')[-1]

    out = sys.stdout
    if verbosity > 0:
        # Print out summary of all of the bots.
        out.write('%11s %11s %4s %s\n\n' %
                  ('CONTROL', 'ROLL', 'DIFF', 'BOT'))
        for bot in sorted(all_bots):
            control_status = control[bot].status
            roll_status = roll[bot].status
            if control_status != roll_status:
                diff = '****'
            elif roll_status != 'success':
                # Both sides agree, but on something other than
                # success.
                diff = '....'
            else:
                diff = ''
            out.write('%11s %11s %4s %s\n' % (
                control_status, roll_status, diff, bot))
        out.write('\n')
        out.flush()

    for bot in sorted(all_bots):
        if roll[bot].status == 'success':
            if verbosity > 1:
                printer(0, '==%s==' % bot)
                printer(1, 'OK')
            continue
        printer(0, '==%s==' % bot)

        for (status, name, url) in (
                (control[bot].status, control_name, control[bot].url),
                (roll[bot].status, roll_name, roll[bot].url)):

            if status == 'failure':
                printer(1, name)
                results = BuilderHTMLParser.parse(url)
                for result in results:
                    # Put each "*.html" token on its own line,
                    # prefixed with "__".
                    formatted_result = re.sub(
                        r'(\S*\.html) ', '\n__\\g<1>\n', result.text)
                    printer(2, formatted_result)
                    if ('compile' in result.text
                            or '...and more' in result.text):
                        # result.url is appended to the build URL with
                        # its last path component removed.
                        printer(3, re.sub('/[^/]*$', '/', url) + result.url)
            else:
                printer(1, name)
                printer(2, status)
        out.write('\n')
| 382 | |
| 383 | |
if __name__ == '__main__':
    if len(sys.argv) < 3:
        # Not enough arguments: show the usage text from the module
        # docstring and fail.
        sys.stderr.write('%s\n' % (__doc__,))
        # sys.exit rather than the bare exit(): the latter is injected
        # by the site module for interactive use and is not always
        # available.
        sys.exit(1)
    # The verbosity level comes from the environment; it defaults to 1.
    main(sys.argv[1], sys.argv[2],
         int(os.environ.get('COMPARE_CODEREVIEW_VERBOSITY', 1)))
| 390 | |