| #!/usr/bin/env python |
| # Copyright 2015 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Create e-mail reports of the Lab's DUT inventory. |
| |
| Gathers a list of all DUTs of interest in the Lab, segregated by |
| model and pool, and determines whether each DUT is working or |
| broken. Then sends one or more e-mail reports summarizing the |
| status to the e-mail addresses provided on the command line. |
| |
| usage: lab_inventory.py [ options ] [ model ... ] |
| |
| Options: |
| --duration / -d <hours> |
| How far back in time to search job history to determine DUT |
| status. |
| |
| --model-notify <address>[,<address>] |
| Send the "model status" e-mail to all the specified e-mail |
| addresses. |
| |
| --pool-notify <address>[,<address>] |
| Send the "pool status" e-mail to all the specified e-mail |
| addresses. |
| |
| --recommend <number> |
| When generating the "model status" e-mail, include a list of |
| <number> specific DUTs to be recommended for repair. |
| |
| --report-untestable |
| Scan the inventory for DUTs that can't test because they're stuck in |
| repair loops, or because the scheduler can't give them work. |
| |
| --logdir <directory> |
| Log progress and actions in a file under this directory. Text |
| of any e-mail sent will also be logged in a timestamped file in |
| this directory. |
| |
| --debug |
| Suppress all logging, metrics reporting, and sending e-mail. |
| Instead, write the output that would be generated onto stdout. |
| |
| <model> arguments: |
| With no arguments, gathers the status for all models in the lab. |
| With one or more named models on the command line, restricts |
| reporting to just those models. |
| |
| """ |
| |
| |
| import argparse |
| import collections |
| import logging |
| import logging.handlers |
| import os |
| import re |
| import sys |
| import time |
| |
| import common |
| from autotest_lib.client.bin import utils |
| from autotest_lib.client.common_lib import time_utils |
| from autotest_lib.server import constants |
| from autotest_lib.server import site_utils |
| from autotest_lib.server.cros.dynamic_suite import frontend_wrappers |
| from autotest_lib.server.hosts import servo_host |
| from autotest_lib.server.lib import status_history |
| from autotest_lib.site_utils import gmail_lib |
| from chromite.lib import metrics |
| |
| |
# CRITICAL_POOLS, SPARE_POOL, MANAGED_POOLS:
#   Local aliases for the pool settings published by
#   `constants.Pools`.

CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS

# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
#     monitoring by this script.  Currently, we're excluding these:
#   + 'adb' - We're not ready to monitor Android or Brillo hosts.
#   + 'board:guado_moblab' - These are maintained by a separate
#     process that doesn't use this script.

_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option, in
#     hours.  Specifies how far back in time to search in order to
#     determine DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting for
#     the --logdir option.  The full path is relative to the root of the
#     autotest directory, as determined from sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs.  Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab, row, rack, host.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')

# _REPAIR_LOOP_THRESHOLD:
#    The number of repeated Repair tasks that must be seen to declare
#    that a DUT is stuck in a repair loop.

_REPAIR_LOOP_THRESHOLD = 4


# _UNTESTABLE_PRESENCE_METRIC:
#    Boolean metric for DUTs that cannot be scheduled for testing.
#    NOTE(review): presumably reported by the --report-untestable
#    scan; the code that uses it is not in this part of the file.

_UNTESTABLE_PRESENCE_METRIC = metrics.BooleanMetric(
    'chromeos/autotest/inventory/untestable',
    'DUTs that cannot be scheduled for testing')
| |
| |
class _HostSetInventory(object):
    """A group of `HostJobHistory` objects, viewable by diagnosis.

    Every recorded history falls into at most one of three disjoint
    categories, based on the first element of its `last_diagnosis()`:
    "working", "broken", or "idle".  For each category there is a
    method returning the matching histories, and a method returning
    how many there are.

    Performance note:  These methods are potentially expensive:
      * `get_working()` / `get_working_list()`
      * `get_broken()` / `get_broken_list()`
      * `get_idle()` / `get_idle_list()`
    Each of them asks the recorded histories for their last
    diagnosis.  The first such request triggers RPC calls backed by
    relatively expensive database queries; the `HostJobHistory`
    objects cache the answers, so only the first call pays full
    price.

    Separately from that RPC caching, the three `get_*_list()`
    methods remember the list they computed, and hand back the same
    list until `record_host()` is called again.

    Nothing is queried in `record_host()`.  That is deliberate:  it
    allows a complete `_LabInventory` to be built without paying the
    RPC cost up front.  `_populate_model_counts()`, below, assumes
    this behavior.

    In current usage all the DUTs in one instance belong to a single
    scheduling pool, but nothing in this class depends on that.
    """

    def __init__(self):
        self._histories = []
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def _select(self, statuses):
        """Return the recorded histories diagnosed with one of `statuses`.

        @param statuses  Container of acceptable diagnosis values.
        @return A new list of `HostJobHistory` objects.

        """
        return [history for history in self._histories
                if history.last_diagnosis()[0] in statuses]


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Any previously cached category lists are discarded.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._histories.append(host_history)
        self._working_list = self._broken_list = self._idle_list = None


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Selects the histories whose last diagnosis is `WORKING`.  The
        list is calculated once and then cached.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = self._select((status_history.WORKING,))
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Selects the histories whose last diagnosis is `BROKEN`.  The
        list is calculated once and then cached.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = self._select((status_history.BROKEN,))
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_idle_list(self):
        """Return a list of all idle DUTs in the pool.

        Selects the histories whose last diagnosis is `UNUSED` or
        `UNKNOWN`.  The list is calculated once and then cached.

        @return A list of HostJobHistory objects.

        """
        idle_statuses = {status_history.UNUSED, status_history.UNKNOWN}
        if self._idle_list is None:
            self._idle_list = self._select(idle_statuses)
        return self._idle_list


    def get_idle(self):
        """Return the number of idle DUTs in the pool."""
        return len(self.get_idle_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)
| |
| |
class _PoolSetInventory(object):
    """Maintains a set of `HostJobHistory`s for a set of pools.

    The collection is segregated into disjoint categories of "working",
    "broken", and "idle" DUTs.  Accessor methods allow finding both the
    list of DUTs in each category, as well as counts of each category.
    Accessor queries can be for an individual pool, or against all
    pools.

    Performance note:  This class relies on `_HostSetInventory`.  Public
    methods in this class generally rely on methods of the same name in
    the underlying class, and so will have the same underlying
    performance characteristics.
    """

    def __init__(self, pools):
        """Create an empty inventory covering `pools`.

        @param pools  Iterable of the pool names to be tracked.  Only
                      hosts from these pools can be recorded.

        """
        self._histories_by_pool = {
            pool: _HostSetInventory() for pool in pools
        }


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The history is filed under its `host_pool`; a `KeyError` is
        raised if that pool wasn't passed to the constructor.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._histories_by_pool[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count  Function to return a count from a
                               `_HostSetInventory` object.
        @param pool            The pool to be counted.  If `None`,
                               return the total across all pools.

        """
        if pool is None:
            return sum(get_pool_count(pool_inventory) for pool_inventory
                       in self._histories_by_pool.values())
        return get_pool_count(self._histories_by_pool[pool])


    def _list_pool(self, get_pool_list, pool=None):
        """Internal helper to list hosts in a given pool.

        @param get_pool_list  Function to return a list of histories
                              from a `_HostSetInventory` object.
        @param pool           The pool to be listed.  If `None`,
                              return the combined list for all pools.

        @return A list of HostJobHistory objects.

        """
        if pool is not None:
            return get_pool_list(self._histories_by_pool[pool])
        combined = []
        for pool_inventory in self._histories_by_pool.values():
            combined.extend(get_pool_list(pool_inventory))
        return combined


    def get_working_list(self, pool=None):
        """Return a list of all working DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `WORKING`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_working_list, pool)


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_HostSetInventory.get_working, pool)


    def get_broken_list(self, pool=None):
        """Return a list of all broken DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `BROKEN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_broken_list, pool)


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return a list of all idle DUTs in the given pool.

        Go through all HostJobHistory objects in the given pool,
        selecting the ones where the last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool  The pool to be listed.  If `None`, return the
                     list across all pools.

        @return A list of HostJobHistory objects.

        """
        return self._list_pool(_HostSetInventory.get_idle_list, pool)


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_idle, pool)


    def get_spares_buffer(self, spare_pool=SPARE_POOL):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool.  This number may be negative, indicating a shortfall
        in the critical pools.

        @param spare_pool  The pool to be treated as the spares pool.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(spare_pool) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_HostSetInventory.get_total, pool)
| |
| |
def _eligible_host(afehost):
    """Return whether this host is eligible for monitoring.

    To be eligible, a host must carry exactly one 'model' label and
    exactly one 'pool' label, and must carry none of the labels in
    the `_EXCLUDED_LABELS` set.

    @param afehost  The host to be tested for eligibility.
    @return `True` if the host should be monitored, else `False`.
    """
    # A DUT that lacks a 'model' or 'pool' label, or has more than
    # one of either, doesn't belong in the managed inventory; finding
    # one generally means there's an error in the database.  Sadly,
    # such errors do turn up from time to time.
    #
    # The _LabInventory constructor depends on the label restrictions
    # and may fail for hosts that violate them.  One bad entry
    # shouldn't take down an entire inventory run, so non-conforming
    # hosts are filtered out here and never reach the inventory.
    labels = afehost.labels
    model_count = sum(
            1 for label in labels
            if label.startswith(constants.Labels.MODEL_PREFIX))
    pool_count = sum(
            1 for label in labels
            if label.startswith(constants.Labels.POOL_PREFIX))
    has_excluded = bool(_EXCLUDED_LABELS.intersection(labels))
    if model_count != 1 or pool_count != 1:
        return False
    return not has_excluded
| |
| |
class _LabInventory(collections.Mapping):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    This is a dict-like collection indexed by model.  Indexing returns
    the _PoolSetInventory object associated with the model.
    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, modellist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects for
        all DUTs in the `MANAGED_POOLS` list.  If `modellist` is
        supplied, the inventory will be restricted to only the given
        models.

        @param afe          AFE object for constructing the
                            `HostJobHistory` objects.
        @param start_time   Start time for the `HostJobHistory` objects.
        @param end_time     End time for the `HostJobHistory` objects.
        @param modellist    List of models to include.  If empty or
                            `None`, include all available models.
        @return A `_LabInventory` object for the specified models.

        """
        target_pools = MANAGED_POOLS
        label_list = [constants.Labels.POOL_PREFIX + l for l in target_pools]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if modellist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            modelhosts = []
            for model in modellist:
                model_label = constants.Labels.MODEL_PREFIX + model
                modelhosts.extend(
                        h for h in afehosts if model_label in h.labels)
            afehosts = modelhosts
        else:
            afehosts = [h for h in afehosts if _eligible_host(h)]
        histories = [
                status_history.HostJobHistory(afe, host,
                                              start_time, end_time)
                for host in afehosts]
        return cls(histories, target_pools)


    def __init__(self, histories, pools):
        """Build the inventory from a list of histories.

        @param histories  List of `HostJobHistory` objects to be
                          recorded.
        @param pools      The pools to be tracked for every model.

        """
        models = {h.host_model for h in histories}
        self._modeldata = {model: _PoolSetInventory(pools) for model in models}
        self._dut_count = len(histories)
        for h in histories:
            self[h.host_model].record_host(h)
        self._boards = {h.host_board for h in histories}


    def __getitem__(self, key):
        return self._modeldata.__getitem__(key)


    def __len__(self):
        return self._modeldata.__len__()


    def __iter__(self):
        return self._modeldata.__iter__()


    def reportable_items(self, spare_pool=SPARE_POOL):
        """Iterate over all items subject to reporting.

        Yields the contents of `self.items()` filtered to include
        only reportable models.  A model is reportable if it has DUTs in
        both `spare_pool` and at least one other pool.

        @param spare_pool  The spare pool to be tested for reporting.
        """
        for model, histories in self.items():
            spares = histories.get_total(spare_pool)
            total = histories.get_total()
            if spares != 0 and spares != total:
                yield model, histories


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self)


    def get_pool_models(self, pool):
        """Return all models in `pool`.

        @param pool  The pool to be inventoried for models.
        @return The set of models with at least one DUT in `pool`.
        """
        return {m for m, h in self.items() if h.get_total(pool)}


    def get_boards(self):
        """Return the set of boards of all DUTs in the inventory."""
        return self._boards
| |
| |
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are omitted from the result.

    Implementation note: locations are compared as
    `(row, rack, host)` tuples of integers, so there is no limit on
    the size of the individual numbers.

    @param inventory_list  List of `HostJobHistory` objects to be
                           sorted.
    @return A list of sorted lists of DUTs.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        # Sort on the location key only; histories themselves need
        # not be comparable.
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list
| |
| |
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping model names to the size of
    the model's spares buffer.

    `repair_list` is a list of `HostJobHistory` objects for the DUTs to
    be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using two
    numbers:
      * Worst case buffer count for any model (higher is better).  This
        is the more significant number for comparison.
      * Number of models at the worst case (lower is better).  This is
        the less significant number.

    Implementation note:  The score could fail to reflect the intended
    criteria if there are more than 1000 models in the inventory.

    @param buffer_counts  A dictionary mapping models to buffer counts.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score.  If `buffer_counts` is empty, there is
            nothing to rate and the score is 0.
    """
    _NMODELS = 1000
    # Each repaired DUT adds one to the buffer of its model.  The
    # model names don't contribute to the final score, so only the
    # adjusted counts are kept.
    repairs_by_model = collections.Counter(
            h.host_model for h in repair_list)
    new_counts = [count + repairs_by_model[model]
                  for model, count in buffer_counts.items()]
    if not new_counts:
        return 0
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    return _NMODELS * worst_count - num_worst
| |
| |
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be repaired.
    The list of DUTs is selected based on these criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical proximity.
      * DUTs for models with a low spares buffer are more important than
        DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a model
    with the lowest spares buffer will be recommended.  If the worst
    spares buffer number is shared by more than one model, the algorithm
    will tend to prefer repair sets that include more of those models
    over sets that cover fewer models.

    If there are no broken DUTs with a recognizable lab location, the
    message contains only the headings.

    @param inventory      `_LabInventory` object from which to generate
                          recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    model_buffer_counts = {}
    broken_list = []
    for model, counts in inventory.reportable_items():
        logging.debug('Listing failed DUTs for %s', model)
        if counts.get_broken() != 0:
            model_buffer_counts[model] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # For each lab, slide a window of `num_recommend` DUTs along the
    # location-sorted list and keep the best scoring window.  The
    # initial window is scored before the sliding starts, in case the
    # number of broken DUTs in a lab is less than `num_recommend`.
    #
    # `recommendation` starts out empty (not `None`) so that the
    # message can still be generated when no lab has any candidates.
    recommendation = []
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        lab_slice = lab_duts[:num_recommend]
        lab_score = _score_repair_set(model_buffer_counts, lab_slice)
        for start in range(1, len(lab_duts) - num_recommend + 1):
            new_slice = lab_duts[start : start + num_recommend]
            new_score = _score_repair_set(model_buffer_counts, new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if best_score is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space in `line_fmt` is mandatory:  Without it,
    # Gmail will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Model', 'Servo?', 'Logs URL')]
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_model,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
| |
| |
def _generate_model_inventory_message(inventory):
    """Generate the "model inventory" e-mail message.

    The model inventory is a list by model summarizing the number of
    working, broken, and idle DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits models with no DUTs in the spare pool or with no
    DUTs in a critical pool.  Only models with at least one broken DUT
    are listed individually; the summary totals cover all reportable
    models.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating model inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_models = 0
    ntotal_models = 0
    summaries = []
    column_names = (
        'Model', 'Avail', 'Bad', 'Idle', 'Good', 'Spare', 'Total')
    for model, counts in inventory.reportable_items():
        logging.debug('Counting %2d DUTS for model %s',
                      counts.get_total(), model)
        # Summary elements laid out in the same order as the column
        # headers:
        #     Model  Avail   Bad  Idle  Good  Spare Total
        #      e[0]   e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (model,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        if element[2]:
            summaries.append(element)
            nbroken_models += 1
        ntotal_models += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Worst spares buffer first; ties go to the model with more
    # broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Calculated by subtraction so the three always sum to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # Nothing to report on; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Models with failures: %d' % nbroken_models,
               'Models in inventory: %d' % ntotal_models,
               '', '',
               'Full model inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % column_names]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
| |
| |
# _POOL_INVENTORY_HEADER:
#     Introductory text placed at the start of the "pool inventory"
#     e-mail message.

_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All models shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
http://go/cros-manage-duts
'''
| |
| |
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and model summarizing the
    number of broken, idle, and working DUTs in the pool.  Only
    models with at least one broken or idle DUT in the pool are
    included in the list.

    N.B. For sample output text formatted as users can expect to see it
    in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    # Separator before each pool's section; empty for the first pool
    # only.
    newline = ''
    for pool in CRITICAL_POOLS:
        message.append(
            '%sStatus for pool:%s, by model:' % (newline, pool))
        message.append(
            '%-20s %5s %5s %5s %5s' % (
                'Model', 'Bad', 'Idle', 'Good', 'Total'))
        data_list = []
        for model, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # models at full strength are not reported
            if not broken and not idle:
                continue
            working = counts.get_working(pool)
            total = counts.get_total(pool)
            data_list.append((model, broken, idle, working, total))
        if data_list:
            # Models with the most broken DUTs come first.
            data_list = sorted(data_list, key=lambda d: -d[1])
            message.extend(
                ['%-20s %5d %5d %5d %5d' % t for t in data_list])
        else:
            message.append('(All models at full strength)')
        newline = '\n'
    return '\n'.join(message)
| |
| |
# _IDLE_INVENTORY_HEADER:
#     Introductory text placed at the start of the "idle inventory"
#     e-mail message.

_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
| |
| |
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory is a host list with corresponding pool and model,
    where the hosts are idle (`UNKNOWN` or `UNUSED`).  Hosts are
    listed for every pool in `MANAGED_POOLS`.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  `_LabInventory` object to be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    message = [_IDLE_INVENTORY_HEADER]
    message.append('Idle Host List:')
    message.append('%-30s %-20s %s' % ('Hostname', 'Model', 'Pool'))
    data_list = []
    for pool in MANAGED_POOLS:
        for model, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), model, pool)
            data_list.extend([(dut.host.hostname, model, pool)
                              for dut in counts.get_idle_list(pool)])
    if data_list:
        message.extend(['%-30s %-20s %s' % t for t in data_list])
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)
| |
| |
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag` for
    the file name.

    If the --debug option was requested, the message is neither logged
    nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are logged as
    errors, but are otherwise ignored.  A failure to write the log
    file doesn't prevent the e-mail from being sent.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.
    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # With a single parenthesized argument, this behaves the same
        # as a statement (Python 2) or as a function call (Python 3).
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` guarantees the file is closed even if the write fails.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s:  %s', filename, e)
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        # Deliberately broad:  a failed e-mail shouldn't abort the
        # rest of the inventory run.
        logging.error('Failed to send e-mail to %s:  %s',
                      all_recipients, e)
| |
| |
def _populate_model_counts(inventory):
    """Gather model counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).
    Normally, we pay that cost by querying as we go.  However, with
    the `--debug` option, we expect a human being to be watching the
    progress in real time.  So, we force the first (expensive) queries
    to happen up front, and provide simple ASCII output on sys.stdout
    to show a progress bar and results.

    The progress bar shows one character per model:  '+' for every
    model numbered 5, 15, 25, ...; the tens digit for every model
    numbered 10, 20, 30, ...; and '.' for all the rest.

    @param inventory  `_LabInventory` object from which to gather
                      counts.
    """
    total_broken = 0
    for n, counts in enumerate(inventory.values(), 1):
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            # Explicit floor division:  the digit must be an integer
            # regardless of Python version.
            c = '%d' % ((n // 10) % 10)
        else:
            c = '.'
        sys.stdout.write(c)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all of a
        # model's `HostJobHistory` objects to query the database and
        # cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)
| |
| |
def _perform_model_inventory(arguments, inventory, timestamp):
    """Build and send the model inventory report.

    The e-mail body is made of up to two sections:
      * Only when `--recommend` was given:  a list of DUTs recommended
        for repair, followed by a three-newline separator.
      * The model inventory message generated from `inventory`.

    The message goes to the `--model-notify` recipients.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        sections.append(_generate_repair_recommendation(
                inventory, arguments.recommend))
        sections.append('\n\n\n')
    sections.append(_generate_model_inventory_message(inventory))
    log_tag = 'models-%s.txt' % timestamp
    subject = 'DUT model inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.model_notify, ''.join(sections))
| |
| |
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Build and send the pool inventory report.

    The e-mail body is the pool inventory message followed by the idle
    DUT message, separated by three newlines.  The message goes to the
    `--pool-notify` recipients.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  `_LabInventory` object to be reported on.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [
        _generate_pool_inventory_message(inventory),
        _generate_idle_inventory_message(inventory),
    ]
    log_tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, log_tag, subject,
                arguments.pool_notify, '\n\n\n'.join(sections))
| |
| |
def _dut_in_repair_loop(history):
    """Return whether a DUT's history indicates a repair loop.

    A DUT is considered looping if it runs no tests, and no tasks pass
    other than repair tasks.

    @param history  An instance of `status_history.HostJobHistory` to be
                    scanned for a repair loop.  The caller guarantees
                    that this history corresponds to a working DUT.
    @returns `True` if the DUT's most recent history indicates a
             repair loop, `False` otherwise.
    """
    # Our caller passes only histories for working DUTs; that means
    # we've already paid the cost of fetching the diagnosis task, and
    # we know that the task was successful.  The diagnosis task will be
    # one of the tasks we must scan to find a loop, so if the task isn't
    # a repair task, then our history includes a successful non-repair
    # task, and we're not looping.
    #
    # The for loop below is very expensive, because it must fetch the
    # full history, regardless of how many tasks we examine.  At the
    # time of this writing, this check against the diagnosis task
    # reduces the cost of finding loops in the full inventory from hours
    # to minutes.
    if history.last_diagnosis()[1].name != 'Repair':
        return False
    repair_ok_count = 0
    for task in history:
        if not task.is_special:
            # This is a test, so we're not looping.
            return False
        if task.diagnosis == status_history.BROKEN:
            # Failed a repair, so we're not looping.
            return False
        if (task.diagnosis == status_history.WORKING
                and task.name != 'Repair'):
            # Non-repair task succeeded, so we're not looping.
            return False
        # At this point, we have either a failed non-repair task, or
        # a successful repair.
        if task.name == 'Repair':
            repair_ok_count += 1
            if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
                return True
    # The history ran out before enough successful repairs were seen.
    # Say so explicitly rather than falling off the end with `None`.
    return False
| |
| |
def _report_untestable_dut(history, state):
    """Log one untestable DUT and record it in the presence metric.

    The DUT's hostname, model and pool are read from `history`, logged
    at INFO level, and attached together with `state` as the fields of
    `_UNTESTABLE_PRESENCE_METRIC`, which is set to `True`.

    @param history  History object for the DUT; its `hostname`,
                    `host_model` and `host_pool` attributes are read.
    @param state    Value for the metric's 'state' field; callers in
                    this file pass 'repair_loop' or 'idle_unlocked'.
    """
    fields = dict(
        dut_hostname=history.hostname,
        model=history.host_model,
        pool=history.host_pool,
        state=state,
    )
    log_format = ('Untestable DUT: %(dut_hostname)s, model: %(model)s, '
                  'pool: %(pool)s')
    logging.info(log_format, fields)
    _UNTESTABLE_PRESENCE_METRIC.set(True, fields=fields)
| |
| |
def _report_repair_loop_metrics(inventory):
    """Report every working DUT that is stuck in a repair loop.

    Walk the working DUTs of each entry in `inventory`; any DUT whose
    history `_dut_in_repair_loop()` flags is reported with the state
    'repair_loop'.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for DUTs in repair loops.')
    for model_counts in inventory.itervalues():
        for dut_history in model_counts.get_working_list():
            # Managed DUTs whose names don't match _HOSTNAME_PATTERN
            # shouldn't exist, but arbitrary strings must never reach
            # the 'dut_hostname' metric field, so such anomalies are
            # skipped before any further (expensive) checking.
            if (_HOSTNAME_PATTERN.match(dut_history.hostname)
                    and _dut_in_repair_loop(dut_history)):
                _report_untestable_dut(dut_history, 'repair_loop')
| |
| |
def _report_idle_dut_metrics(inventory):
    """Report every idle DUT that is not locked.

    Walk the idle DUTs of each entry in `inventory`; any DUT whose
    host is not locked is reported with the state 'idle_unlocked'.

    @param inventory  `_LabInventory` object to be reported on.
    """
    logging.info('Scanning for idle, unlocked DUTs.')
    for model_counts in inventory.itervalues():
        for dut_history in model_counts.get_idle_list():
            # Managed DUTs whose names don't match _HOSTNAME_PATTERN
            # shouldn't exist, but arbitrary strings must never reach
            # the 'dut_hostname' metric field, so such anomalies are
            # skipped.
            if (_HOSTNAME_PATTERN.match(dut_history.hostname)
                    and not dut_history.host.locked):
                _report_untestable_dut(dut_history, 'idle_unlocked')
| |
| |
def _report_untestable_dut_metrics(inventory):
    """Scan the inventory for DUTs unable to run tests.

    A DUT is judged "untestable" when either of these holds:
      * It is stuck in a repair loop:  it regularly passes repair, but
        never passes any other operation.
      * It runs no tasks at all, yet is not locked.

    The inventory is scanned once for each criterion, in the order
    above.  Results are reported via a Monarch presence metric.

    Note:  So that DUTs aren't flagged as "idle" merely for lack of
    work, a separate job that runs before regular inventory runs
    schedules trivial work on any DUT that appears idle.

    @param inventory  `_LabInventory` object to be reported on.
    """
    scans = (_report_repair_loop_metrics, _report_idle_dut_metrics)
    for scan in scans:
        scan(inventory)
| |
| |
def _log_startup(arguments, startup_time):
    """Log the start of this inventory run.

    Emit debug-level log messages naming the run and the e-mail
    reports it will include, and build the string that identifies this
    run in log files and e-mail messages.

    @param arguments     Command-line arguments as returned by
                         `ArgumentParser`.
    @param startup_time  A UNIX timestamp marking the moment when
                         this inventory run began.
    @returns A timestamp string of the form YYYY-MM-DD.HH (local
             time) that will be used to identify this run in logs and
             email output.
    """
    started = time.localtime(startup_time)
    timestamp = time.strftime('%Y-%m-%d.%H', started)
    logging.debug('Starting lab inventory for %s', timestamp)
    plans = []
    if arguments.model_notify:
        # Repair recommendations are part of the model e-mail, so
        # they're only mentioned when that e-mail will be sent.
        if arguments.recommend:
            plans.append('Will include repair recommendations')
        plans.append('Will include model inventory')
    if arguments.pool_notify:
        plans.append('Will include pool inventory')
    for plan in plans:
        logging.debug(plan)
    return timestamp
| |
| |
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours leading up to
    `end_time`, and is restricted to `arguments.modelnames`.  The
    number of hosts and models found is logged.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.
    @returns The newly created `_LabInventory` object.
    """
    seconds_back = arguments.duration * 60 * 60
    afe = frontend_wrappers.RetryingAFE(server=None)
    inventory = _LabInventory.create_inventory(
            afe, end_time - seconds_back, end_time,
            arguments.modelnames)
    num_duts = inventory.get_num_duts()
    num_models = inventory.get_num_models()
    logging.info('Found %d hosts across %d models', num_duts, num_models)
    return inventory
| |
| |
def _perform_inventory_reports(arguments):
    """Perform all inventory checks requested on the command line.

    Create the inventory as of "now", then run in order:  the
    interactive pre-population pass (only with `--debug`), the model
    e-mail report, the pool e-mail report, and the untestable DUT
    metrics scan.  Each report runs only if its option was requested.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    now = time.time()
    timestamp = _log_startup(arguments, now)
    inventory = _create_inventory(arguments, now)
    if arguments.debug:
        _populate_model_counts(inventory)
    # An e-mail report is sent only when its notify list is non-empty.
    email_reports = (
        ('model_notify', _perform_model_inventory),
        ('pool_notify', _perform_pool_inventory),
    )
    for option, perform_report in email_reports:
        if getattr(arguments, option):
            perform_report(arguments, inventory, timestamp)
    if arguments.report_untestable:
        _report_untestable_dut_metrics(inventory)
| |
| |
def _separate_email_addresses(address_list):
    """Flatten comma-separated address strings into one list.

    Every string in `address_list` is split at commas, and surrounding
    whitespace is stripped from each piece.  Pieces are kept in their
    original order; empty pieces are not filtered out.

    @param address_list  A list of strings containing comma
                         separate e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    return [address.strip()
            for entry in address_list
            for address in entry.split(',')]
| |
| |
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--model-notify` and `--pool-notify` options may each be given
    several times, each time with comma separated addresses; replace
    both with a single flat list of addresses.

    Outside of debug mode, at least one report must be requested.  In
    debug mode, requesting no report means "run all the e-mail
    reports".

    The return value indicates success or failure; in the case of
    failure, an error message is also written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.model_notify = _separate_email_addresses(
            arguments.model_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if (arguments.model_notify or arguments.pool_notify
            or arguments.report_untestable):
        return True
    if arguments.debug:
        # We want to run all the e-mail reports.  An empty notify
        # list will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.model_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must request at least one report via '
                     '--model-notify, --pool-notify, or '
                     '--report-untestable\n')
    return False
| |
| |
def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` entry in the parent of the directory
    that contains this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_path = os.path.abspath(script)
    script_dir = os.path.dirname(script_path)
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
| |
| |
def _parse_command(argv):
    """Parse and validate the command line arguments.

    Build the `ArgumentParser` for this command's syntax, parse
    `argv[1:]` with it, and check the result with
    `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return The result of `ArgumentParser.parse_args()`, or `None` if
            `_verify_arguments()` rejected the arguments.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    add_option = parser.add_argument
    add_option('-d', '--duration', type=int, metavar='HOURS',
               default=_DEFAULT_DURATION,
               help=('number of hours back to search for status'
                     ' (default: %d)' % _DEFAULT_DURATION))
    add_option('--model-notify', action='append', default=[],
               metavar='ADDRESS',
               help=('Generate model inventory message, '
                     'and send it to the given e-mail address(es)'))
    add_option('--pool-notify', action='append', default=[],
               metavar='ADDRESS',
               help=('Generate pool inventory message, '
                     'and send it to the given address(es)'))
    add_option('-r', '--recommend', type=int, default=None,
               help=('Specify how many DUTs should be '
                     'recommended for repair (default: no '
                     'recommendation)'))
    add_option('--report-untestable', action='store_true',
               help='Check for devices unable to run tests.')
    add_option('--debug-metrics', action='store_true',
               help=('Include debug information about the metrics '
                     'that would be reported '))
    add_option('--debug', action='store_true',
               help=('Print e-mail messages on stdout '
                     'without sending them.'))
    add_option('--logdir', default=_get_default_logdir(argv[0]),
               help='Directory where logs will be written.')
    add_option('modelnames', nargs='*', metavar='MODEL',
               help=('names of models to report on '
                     '(default: all models)'))
    arguments = parser.parse_args(argv[1:])
    return arguments if _verify_arguments(arguments) else None
| |
| |
def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was provided on
    the command line.
      * Without the option, we configure the logging to capture all
        potentially relevant events in a log file.  The log file is
        configured to rotate once a week on Friday evening, preserving
        ~3 months worth of history.  The log directory is created if
        it doesn't exist yet.
      * With the option, we expect stdout to contain other
        human-readable output (including the contents of the e-mail
        messages), so we restrict the output to INFO level.

    For convenience, when `--debug` is on, the logging format has
    no adornments, so that a call like `logging.info(msg)` simply writes
    `msg` to stdout, plus a trailing newline.

    Every handler already installed on the root logger is removed, so
    that on return the handler configured here is the only one.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        if not os.path.exists(arguments.logdir):
            os.mkdir(arguments.logdir)
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross.  Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*.  That gives us an
    # extra logging handler that we don't want.  So, clear out all
    # the handlers here.
    #
    # Iterate over a copy:  removeHandler() deletes from the very list
    # being walked, and looping over the live list would skip every
    # other handler.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
| |
| |
def main(argv):
    """Standard main routine.

    Parse the command line (exiting with status 1 if it is rejected),
    configure logging, and run the requested inventory reports.
    Unless `--debug` is given without `--debug-metrics`, the reports
    run with ts_mon metrics set up, and the metrics are flushed
    afterwards.

    A keyboard interrupt ends the run silently.  Any other exception
    raised by the reports is logged, not propagated.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        if arguments.debug and not arguments.debug_metrics:
            # Plain debug run:  no metrics reporting at all.
            _perform_inventory_reports(arguments)
        else:
            if arguments.debug_metrics:
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            with site_utils.SetupTsMonGlobalState(
                    'lab_inventory', debug_file=metrics_file,
                    auto_flush=False):
                _perform_inventory_reports(arguments)
            metrics.Flush()
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
| |
| |
def get_inventory(afe):
    """Create a lab inventory covering the last 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.
    @return The object returned by `_LabInventory.create_inventory()`
            for the time range from 24 hours ago until now.
    """
    now = int(time.time())
    one_day_ago = now - 24 * 60 * 60
    return _LabInventory.create_inventory(afe, one_day_ago, now)
| |
| |
def get_managed_boards(afe):
    """Return the boards found in a fresh 24 hour inventory.

    @param afe  AFE object passed through to `get_inventory()`.
    @return The result of `get_boards()` on the inventory returned by
            `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_boards()
| |
| |
if __name__ == '__main__':
    # Executed as a script rather than imported:  run the inventory
    # with the complete argument vector, program name included.
    main(argv=sys.argv)