mbligh | f3294cc | 2009-04-08 21:17:38 +0000 | [diff] [blame] | 1 | """ |
| 2 | Autotest AFE Cleanup used by the scheduler |
| 3 | """ |
| 4 | |
| 5 | |
| 6 | import datetime, time, logging |
| 7 | import common |
| 8 | from autotest_lib.database import database_connection |
| 9 | from autotest_lib.frontend.afe import models |
| 10 | from autotest_lib.scheduler import email_manager, scheduler_config |
| 11 | |
| 12 | |
class PeriodicCleanup(object):
    """Base class for cleanup work that runs at most once per interval.

    Subclasses provide the actual work by overriding _cleanup().
    """


    def __init__(self, db, clean_interval, run_at_initialize=False):
        """
        @param db: database handle, kept on the instance as self._db.
        @param clean_interval: minimum number of minutes between two
                cleanups triggered through run_cleanup_maybe().
        @param run_at_initialize: when true, _cleanup() is invoked once
                from the constructor.
        """
        self._db = db
        self.clean_interval = clean_interval
        # The timestamp is taken before the optional initial cleanup, so the
        # first interval is measured from construction time.
        self._last_clean_time = time.time()
        if run_at_initialize:
            self._cleanup()


    def run_cleanup_maybe(self):
        """Run _cleanup() if more than clean_interval minutes have passed
        since the last recorded cleanup time, then record the new time.
        """
        # clean_interval is expressed in minutes, time.time() in seconds.
        next_due = self._last_clean_time + self.clean_interval * 60
        if time.time() <= next_due:
            return

        self._cleanup()
        self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
| 35 | |
| 36 | |
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval in the SCHEDULER section.
    """


    def __init__(self, db, clean_interval_minutes):
        """
        @param db: database handle, forwarded to PeriodicCleanup.
        @param clean_interval_minutes: minimum number of minutes between
                cleanup runs.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)


    def _cleanup(self):
        """Run every user-level cleanup step, in a fixed order."""
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_synch_start_timeout()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()


    def _abort_timed_out_jobs(self):
        """Abort each job that still has an incomplete queue entry and whose
        creation time plus its timeout (in hours) is already in the past.
        """
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        # The timeout comparison is done by the database through a raw
        # WHERE clause (MySQL INTERVAL syntax).
        query = models.Job.objects.filter(hostqueueentry__complete=False).extra(
                where=['created_on + INTERVAL timeout HOUR < NOW()'])
        # distinct(): a job with several incomplete entries would otherwise
        # show up once per matching entry.
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort(None)


    def _abort_jobs_past_synch_start_timeout(self):
        """
        Abort synchronous jobs that are past the start timeout (from global
        config) and are holding a machine that's in everyone.

        For every matching job, all of its queue entries that are not
        currently running are aborted.
        """
        msg = 'Aborting synchronous jobs that are past the start timeout'
        logging.info(msg)
        timeout_delta = datetime.timedelta(
                minutes=scheduler_config.config.synch_job_start_timeout_minutes)
        timeout_start = datetime.datetime.now() - timeout_delta
        # Jobs created before the cutoff that still have a Pending entry on
        # a host belonging to the 'Everyone' ACL group.
        query = models.Job.objects.filter(
                created_on__lt=timeout_start,
                hostqueueentry__status='Pending',
                hostqueueentry__host__aclgroup__name='Everyone')
        for job in query.distinct():
            logging.warning('Aborting job %d due to start timeout', job.id)
            entries_to_abort = job.hostqueueentry_set.exclude(
                    status=models.HostQueueEntry.Status.RUNNING)
            for queue_entry in entries_to_abort:
                queue_entry.abort(None)


    def _check_for_db_inconsistencies(self):
        """Log and e-mail a report if any queue entry is flagged both active
        and complete.

        At most the first 50 offending entries are listed in the message; a
        '(truncated)' marker is appended when there are more.
        """
        logging.info('Checking for db inconsistencies')
        query = models.HostQueueEntry.objects.filter(active=True,
                                                     complete=True)
        # Ask the database for the count exactly once and reuse it. Calling
        # len() on the queryset would fetch every matching row just to decide
        # whether the listing was truncated.
        bad_entry_count = query.count()
        if bad_entry_count == 0:
            return

        max_listed = 50
        subject = ('%d queue entries found with active=complete=1'
                   % bad_entry_count)
        message = '\n'.join(str(entry.get_object_dict())
                            for entry in query[:max_listed])
        if bad_entry_count > max_listed:
            message += '\n(truncated)\n'

        logging.error(subject)
        email_manager.manager.enqueue_notify_email(subject, message)


    def _clear_inactive_blocks(self):
        """Delete ineligible_host_queues rows whose job has no incomplete
        host queue entry left.
        """
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")
| 113 | |
| 114 | |
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Upkeep that runs once when monitor_db starts and then again every
    twenty four hours.
    """


    def __init__(self, db, run_at_initialize=True):
        """
        @param db: database handle, forwarded to PeriodicCleanup.
        @param run_at_initialize: when true (the default), the cleanup is
                run once from the constructor.
        """
        # PeriodicCleanup takes its interval in minutes.
        minutes_per_day = 24 * 60
        super(TwentyFourHourUpkeep, self).__init__(
                db, minutes_per_day, run_at_initialize=run_at_initialize)


    def _cleanup(self):
        """Run the daily upkeep steps."""
        logging.info('Running 24 hour clean up')
        self._django_session_cleanup()


    def _django_session_cleanup(self):
        """Delete expired rows from django_session, since django does not
        do it for us.
        http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
        self._db.execute(
                'DELETE FROM django_session WHERE expire_date < NOW()')