Perf extraction/parsing scripts for new ChromeOS performance dashboard.

This CL contains two Python scripts that: (1) extract perf keys from the
autotest DB; and (2) create perf data files from the extracted data that
can be read in to display Chrome-style perf graphs.

BUG=chromium-os:30753
TEST=Verified the scripts work on my local machine and on cautotest, and
that Chrome-style graphs can be displayed from the created data files.

Change-Id: I7ef85c7aba62d00f9c226a1ce33904bf6788dc78
Reviewed-on: https://gerrit.chromium.org/gerrit/37871
Commit-Queue: Dennis Jeffrey <dennisjeffrey@chromium.org>
Reviewed-by: Dennis Jeffrey <dennisjeffrey@chromium.org>
Tested-by: Dennis Jeffrey <dennisjeffrey@chromium.org>
diff --git a/frontend/perf-dashboard/extract_perf.py b/frontend/perf-dashboard/extract_perf.py
new file mode 100644
index 0000000..9cd66a1
--- /dev/null
+++ b/frontend/perf-dashboard/extract_perf.py
@@ -0,0 +1,316 @@
+#!/usr/bin/env python
+# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Extracts perf keys from autotest database and writes to local data files.
+
+This script keeps track of the job IDs for which perf keys have already been
+extracted on previous runs.  The script only processes job IDs whose perf
+values have not been extracted by a previous run.
+
+Sample usage:
+    python extract_perf.py -v
+
+Run with -h to see the full set of command-line options.
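+
+Each processed job is appended to data/<test_name>/<platform>.txt as a single
+JSON-encoded line of the form (the values shown here are hypothetical):
+
+    ["12345", "lumpy-release-R23-2913.0.0", "lumpy",
+     [["milliseconds_to_load", "250.0"], ["frames_per_second", "60.0"]]]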
+"""
+
+import datetime
+import logging
+import optparse
+import os
+import re
+import simplejson
+import sys
+import time
+
+import common
+from autotest_lib.client.common_lib import global_config
+from autotest_lib.server import frontend
+
+try:
+    from google.storage.speckle.python.api import rdbms_googleapi
+except ImportError:
+    # Download the AppEngine SDK if desired from here:
+    # https://developers.google.com/appengine/downloads
+    rdbms_googleapi = None
+
+try:
+    import MySQLdb
+except ImportError:
+    MySQLdb = None
+
+_GLOBAL_CONF = global_config.global_config
+_CONF_SECTION = 'AUTOTEST_WEB'
+
+_MYSQL_READONLY_LOGIN_CREDENTIALS = {
+    'host': _GLOBAL_CONF.get_config_value(_CONF_SECTION, 'readonly_host'),
+    'user': _GLOBAL_CONF.get_config_value(_CONF_SECTION, 'readonly_user'),
+    'passwd': _GLOBAL_CONF.get_config_value(_CONF_SECTION, 'readonly_password'),
+    'db': _GLOBAL_CONF.get_config_value(_CONF_SECTION, 'database'),
+}
+
+# Paths to files.
+_ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+_CHART_CONFIG_FILE = os.path.join(_ROOT_DIR, 'croschart_defaults.json')
+_DEST_DATA_DIR = os.path.join(_ROOT_DIR, 'data')
+_COMPLETED_ID_FILE = os.path.join(_DEST_DATA_DIR, 'job_id_complete.txt')
+# Use the basename so the PID file lands inside the data directory even when
+# this script is invoked via an absolute path.
+_CURR_PID_FILE = os.path.join(
+    _DEST_DATA_DIR, os.path.basename(__file__) + '.curr_pid.txt')
+
+# Values that can be configured through options.
+_NUM_DAYS_BACK = 7  # Ignore perf test runs in database that finished more than
+                    # this many days ago.
+
+# Other values that can only be configured here in the code.
+_AFE = frontend.AFE()
+_PLATFORMS = [label.name for label in _AFE.get_labels(platform=True)]
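+# Jobs for the 'snow' platform appear in the database under the board name
+# 'daisy', so track that platform by its board name instead.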
+if 'snow' in _PLATFORMS:
+    _PLATFORMS.remove('snow')
+    _PLATFORMS.append('daisy')
+
+
+def get_job_ids(cursor, test_name, oldest_db_lookup_date, completed_ids):
+    """Gets all job IDs for the given test name that haven't yet been processed.
+
+    @param cursor: see query_database().
+    @param test_name: see query_database().
+    @param oldest_db_lookup_date: see query_database().
+    @param completed_ids: see query_database().
+
+    @return A list of string job IDs from the database that should be processed.
+    """
+    query = ('SELECT DISTINCT afe_job_id '
+             'FROM tko_perf_view_2 INNER JOIN tko_jobs USING (job_idx) '
+             'INNER JOIN tko_status USING (status_idx) WHERE '
+             'test_name = %s AND test_finished_time >= %s AND '
+             'word != "RUNNING"')
+    start_time = time.time()
+    cursor.execute(query, (test_name, oldest_db_lookup_date))
+    logging.debug('Extracted job IDs in %.2f seconds', time.time() - start_time)
+    job_ids = []
+    for result_row in cursor:
+        job_id = str(result_row[0])
+        if job_id not in completed_ids:
+            job_ids.append(job_id)
+    return job_ids
+
+
+def write_perf_info_to_disk(job_id, result_dict, test_dir):
+    """Writes extracted perf data for the given job ID to disk.
+
+    Also writes the job ID to disk to mark it as having been processed.  Note
+    that the written files are not protected against simultaneous access by
+    multiple invocations of this script.
+
+    @param job_id: The string job ID.
+    @param result_dict: A dictionary of associated perf info to write to disk.
+    @param test_dir: The string directory name in which to write perf data.
+    """
+    result_out = [job_id, result_dict['job_name'], result_dict['platform']]
+    perf_items = []
+    for perf_key in result_dict['perf_keys']:
+        for perf_val in result_dict['perf_keys'][perf_key]:
+            perf_items.append((perf_key, perf_val))
+    result_out.append(perf_items)
+    file_name = os.path.join(test_dir, result_dict['platform'] + '.txt')
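+    # Each output line is a single JSON list:
+    #   [job_id, job_name, platform, [[perf_key, perf_value], ...]]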
+    with open(file_name, 'a') as fp:
+        fp.write(simplejson.dumps(result_out) + '\n')
+
+    with open(_COMPLETED_ID_FILE, 'a') as fp:
+        fp.write(job_id + '\n')
+
+
+def extract_perf_for_job_id(cursor, job_id, unexpected_job_names, test_dir):
+    """Extracts perf data for a given job, then writes to local text files.
+
+    @param cursor: A MySQLdb.cursor object used for interacting with a database.
+    @param job_id: The string job ID to process.
+    @param unexpected_job_names: A set of job names encountered so far that are
+        not associated with a known platform type.
+    @param test_dir: The string directory name in which to write perf data.
+
+    @return True if data for the specified job ID was written to disk;
+        False otherwise (in particular, when the job ID is not associated
+        with a known platform type).
+    """
+    query = ('SELECT job_name,iteration_key,iteration_value '
+             'FROM tko_perf_view_2 INNER JOIN tko_jobs USING (job_idx) '
+             'WHERE afe_job_id = %s')
+    # Query parameters must be passed as a sequence per the DB-API.
+    cursor.execute(query, (job_id,))
+
+    result = {}
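+    # Each result row is one perf (key, value) measurement recorded for one
+    # iteration of the job.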
+    for job_name, key, val in cursor:
+        # The job_name string contains the platform name. The platform name is
+        # always followed by either "-rX", where X is the milestone number
+        # (this is from legacy data in the database), or else it is followed
+        # by "-release" (for more recent data in the database).  We do not
+        # consider jobs in which the platform name is followed by anything
+        # else (in particular, "-paladin" runs).
+        #
+        # TODO(dennisjeffrey): Simplify the below code once the following bug
+        # is addressed to standardize the platform names: crosbug.com/38521.
+        match = re.search(r'(\w+)-r', job_name)
+        # Only process jobs for known platforms; also skip job names that do
+        # not match the expected pattern at all.
+        platform = None
+        if match and match.group(1) in _PLATFORMS:
+            platform = match.group(1)
+        if platform:
+            result['job_name'] = job_name
+            result['platform'] = platform
+            result.setdefault('perf_keys', {})
+            if key and val:
+                result['perf_keys'].setdefault(key, [])
+                result['perf_keys'][key].append(val)
+        else:
+            unexpected_job_names.add(job_name)
+
+    if 'platform' not in result:
+        return False
+
+    write_perf_info_to_disk(job_id, result, test_dir)
+    return True
+
+
+def query_database(cursor, test_name, completed_ids, oldest_db_lookup_date):
+    """Queries database for perf values and stores them into local text files.
+
+    This function performs the work only for the specified test case.
+
+    @param cursor: A MySQLdb.cursor object used for interacting with a database.
+    @param test_name: The string name of a test case to process.
+    @param completed_ids: A set of job IDs that have already been previously
+        extracted from the database.
+    @param oldest_db_lookup_date: The oldest date (represented as a string) for
+        which we want to consider perf values in the database.
+
+    @return The number of new job IDs that have been extracted/processed.
+    """
+    test_dir = os.path.join(_DEST_DATA_DIR, test_name)
+    if not os.path.isdir(test_dir):
+        os.makedirs(test_dir)
+
+    # Identify the job IDs that need to be processed.
+    job_ids = get_job_ids(cursor, test_name, oldest_db_lookup_date,
+                          completed_ids)
+
+    # For each job ID, extract the perf values we need.
+    unexpected_job_names = set()
+    num_newly_added = 0
+    for i, job_id in enumerate(job_ids):
+        logging.debug('Processing job %d of %d', i + 1, len(job_ids))
+
+        if extract_perf_for_job_id(cursor, job_id, unexpected_job_names,
+                                   test_dir):
+            completed_ids.add(job_id)
+            num_newly_added += 1
+
+    if unexpected_job_names:
+        logging.debug('Job names skipped due to unexpected platform: %s',
+                      list(unexpected_job_names))
+
+    return num_newly_added
+
+
+def extract_new_perf_data(cursor, options):
+    """Extracts new perf data from database and writes data to local text files.
+
+    @param cursor: A MySQLdb.cursor object used for interacting with a database.
+    @param options: An optparse.OptionParser options object.
+
+    @return The number of new job IDs that have been extracted/processed.
+    """
+    with open(_CHART_CONFIG_FILE, 'r') as fp:
+        charts = simplejson.load(fp)
+
+    # Compute the oldest date for the perf values that we want to consider.
+    oldest_db_lookup_date = (
+        datetime.date.today() -
+        datetime.timedelta(days=options.num_days_back)).strftime('%Y-%m-%d')
+
+    logging.debug('Extracting job IDs from %s onwards.',
+                  oldest_db_lookup_date)
+
+    # Get unique test names.
+    test_names = set([c['test_name'] for c in charts])
+
+    # Get list of already-completed job IDs so we don't re-fetch their data.
+    completed_ids = set()
+    if os.path.isfile(_COMPLETED_ID_FILE):
+        with open(_COMPLETED_ID_FILE, 'r') as fp:
+            completed_ids.update(line.strip() for line in fp)
+
+    num_newly_added = 0
+    for i, test_name in enumerate(test_names):
+        logging.info('Extracting info for test %d of %d: %s', i + 1,
+                     len(test_names), test_name)
+
+        num_newly_added += query_database(cursor, test_name, completed_ids,
+                                          oldest_db_lookup_date)
+
+    return num_newly_added
+
+
+def cleanup():
+    """Cleans up when this script is done."""
+    if os.path.isfile(_CURR_PID_FILE):
+        os.remove(_CURR_PID_FILE)
+
+
+def main():
+    """Main function."""
+    parser = optparse.OptionParser()
+    parser.add_option('-n', '--num-days-back', metavar='NUM_DAYS', type='int',
+                      default=_NUM_DAYS_BACK,
+                      help='Consider only perf test results computed within '
+                           'this many days in the past (if this script is '
+                           'invoked daily, there is no need to consider '
+                           'history from many days back). Defaults to '
+                           '%default days back.')
+    parser.add_option('-c', '--cloud-sql', action='store_true', default=False,
+                      help='Connect to the chromeos-lab CloudSQL database, '
+                           'rather than the original MySQL autotest database.')
+    parser.add_option('-v', '--verbose', action='store_true', default=False,
+                      help='Use verbose logging.')
+    options, _ = parser.parse_args()
+
+    log_level = logging.DEBUG if options.verbose else logging.INFO
+    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
+                        level=log_level)
+
+    cursor = None
+    if options.cloud_sql:
+        # CloudSQL.
+        logging.debug('Connecting to CloudSQL...')
+        if rdbms_googleapi is None:
+            logging.error('CloudSQL requested, but cannot locate CloudSQL '
+                          'dependencies. Have you set up CloudSQL on this '
+                          'machine?')
+            sys.exit(1)
+        db = rdbms_googleapi.connect(None,
+                                     instance='chromeos-bot:chromeos-lab')
+        cursor = db.cursor()
+        cursor.execute('USE chromeos_autotest_db')
+    else:
+        # Autotest MySQL database.
+        logging.debug('Connecting to Autotest MySQL database...')
+        if MySQLdb is None:
+            logging.error('MySQL requested, but cannot locate MySQL '
+                          'dependencies. Have you set up MySQL on this '
+                          'machine?')
+            sys.exit(1)
+        db = MySQLdb.connect(**_MYSQL_READONLY_LOGIN_CREDENTIALS)
+        cursor = db.cursor()
+
+    logging.debug('Database connection complete.')
+
+    if not os.path.isdir(_DEST_DATA_DIR):
+        os.makedirs(_DEST_DATA_DIR)
+
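+    # Guard against concurrent runs: the data files written by this script
+    # are not protected against simultaneous access (see
+    # write_perf_info_to_disk).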
+    common.die_if_already_running(_CURR_PID_FILE, logging)
+    try:
+        num_newly_added = extract_new_perf_data(cursor, options)
+    finally:
+        # Remove the PID file even if extraction fails partway through.
+        cleanup()
+    logging.info('Done! Added info for %d new job IDs.', num_newly_added)
+
+
+if __name__ == '__main__':
+    main()