| mbligh | f3294cc | 2009-04-08 21:17:38 +0000 | [diff] [blame] | 1 | """ |
| 2 | Autotest AFE Cleanup used by the scheduler |
| 3 | """ |
| 4 | |
| 5 | |
| 6 | import datetime, time, logging |
| mbligh | f3294cc | 2009-04-08 21:17:38 +0000 | [diff] [blame] | 7 | from autotest_lib.database import database_connection |
| 8 | from autotest_lib.frontend.afe import models |
| 9 | from autotest_lib.scheduler import email_manager, scheduler_config |
| showard | 8dbd05a | 2010-01-12 18:54:59 +0000 | [diff] [blame] | 10 | from autotest_lib.client.common_lib import host_protections |
| mbligh | f3294cc | 2009-04-08 21:17:38 +0000 | [diff] [blame] | 11 | |
| 12 | |
class PeriodicCleanup(object):
    """Base class for cleanup tasks that are re-run on a fixed interval.

    Subclasses supply the actual work by overriding _cleanup().  The
    interval is expressed in minutes (it is multiplied by 60 before being
    compared against time.time()).
    """


    def __init__(self, db, clean_interval, run_at_initialize=False):
        """
        @param db: database handle stored for use by subclasses.
        @param clean_interval: minutes that must elapse between cleanups.
        @param run_at_initialize: if True, initialize() runs one cleanup
                right away.
        """
        self._run_at_initialize = run_at_initialize
        self._last_clean_time = time.time()
        self.clean_interval = clean_interval
        self._db = db


    def initialize(self):
        """Run a cleanup immediately when that was requested at creation."""
        if not self._run_at_initialize:
            return
        self._cleanup()


    def run_cleanup_maybe(self):
        """Run _cleanup() if the interval has elapsed since the last run."""
        next_due = self._last_clean_time + self.clean_interval * 60
        if time.time() <= next_due:
            # Not due yet.
            return
        self._cleanup()
        # Stamp after the cleanup so its own duration is not counted
        # against the next interval.
        self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
| 39 | |
| 40 | |
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval in the SCHEDULER section.
    """


    def __init__(self, db, clean_interval_minutes):
        """
        @param db: database handle used for the raw SQL cleanups.
        @param clean_interval_minutes: minutes between periodic cleanups.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Host reverification runs on its own period, tracked separately
        # from the general cleanup interval.
        self._last_reverify_time = time.time()


    def _cleanup(self):
        """Run every periodic cleanup step in order."""
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._reverify_dead_hosts()


    def _abort_timed_out_jobs(self):
        """Abort jobs with incomplete queue entries whose timeout (in hours,
        counted from created_on) has passed.
        """
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        query = models.Job.objects.filter(hostqueueentry__complete=False).extra(
                where=['created_on + INTERVAL timeout HOUR < NOW()'])
        # distinct(): a job with several incomplete entries would otherwise
        # appear once per entry.
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()


    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        rows = self._db.execute("""
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_hrs HOUR < NOW()""")
        # The ids come from raw SQL; reload them as model objects so the
        # model-level abort() can be used.
        query = models.HostQueueEntry.objects.filter(
            id__in=[row[0] for row in rows])
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort()


    def _check_for_db_inconsistencies(self):
        """Detect and clean relationships that point at invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: model class whose invalid instances are checked.
        @param relation_field: name of the attribute on first_model
                instances that manages the relation to second_model.
        @param second_model: model class on the other side of the relation.

        @returns a list of error strings, one per invalid object that still
                had related objects (empty if first_model has no 'invalid'
                field).
        """
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        # The code below reads the result back from each object's
        # 'related_objects' attribute.
        first_model.objects.populate_relationships(invalid_objects,
                                                   second_model,
                                                   'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(str(related_object) for related_object
                                         in invalid_object.related_objects)
                error_lines.append('Invalid %s %s is related to %ss: %s'
                                   % (first_model.__name__, invalid_object,
                                      second_model.__name__, related_list))
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Run the one-way check in both directions of a relationship.

        @returns the combined list of error strings from both directions.
        """
        errors = self._check_invalid_related_objects_one_way(
            first_model, first_field, second_model)
        errors.extend(self._check_invalid_related_objects_one_way(
            second_model, second_field, first_model))
        return errors


    def _check_all_invalid_related_objects(self):
        """Check every known model pair and report anything that was cleaned.

        When errors are found they are logged as warnings and queued as a
        notification email.
        """
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in model_pairs:
            errors.extend(self._check_invalid_related_objects(
                first_model, first_field, second_model, second_field))

        if errors:
            subject = ('%s relationships to invalid models, cleaned all' %
                       len(errors))
            message = '\n'.join(errors)
            logging.warning(subject)
            logging.warning(message)
            email_manager.manager.enqueue_notify_email(subject, message)


    def _clear_inactive_blocks(self):
        """Delete ineligible-host-queue rows for jobs that have no
        incomplete host queue entries left.
        """
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @returns False when the configured period is 0, otherwise True
                once at least one period has passed since the last
                reverify.
        """
        reverify_period_sec = (scheduler_config.config.reverify_period_minutes
                               * 60)
        if reverify_period_sec == 0:
            return False
        return (self._last_reverify_time + reverify_period_sec) <= time.time()


    def _reverify_dead_hosts(self):
        """Schedule a VERIFY special task for each Repair Failed host.

        Only unlocked, valid hosts whose protection is not DO_NOT_VERIFY are
        considered.  Does nothing until the reverify period has elapsed.
        """
        if not self._should_reverify_hosts_now():
            return

        # Stamp before querying so the period is measured from the start of
        # this attempt, whether or not any hosts are found.
        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False)
        hosts = hosts.exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        # Evaluate the queryset once; the result is used for the emptiness
        # test, the log line and the scheduling loop.
        hosts = list(hosts)
        if not hosts:
            return

        # Lazy logging args, consistent with the other log calls in this
        # class.
        logging.info('Reverifying dead hosts %s',
                     ', '.join(host.hostname for host in hosts))
        for host in hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)
| 185 | |
| 186 | |
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """


    def __init__(self, db, run_at_initialize=True):
        """
        @param db: database handle used for the raw SQL checks.
        @param run_at_initialize: if True (the default), initialize() runs
                one cleanup right away.
        """
        clean_interval = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
            db, clean_interval, run_at_initialize=run_at_initialize)


    def _cleanup(self):
        """Run every daily upkeep step in order."""
        logging.info('Running 24 hour clean up')
        self._django_session_cleanup()
        self._check_for_uncleanable_db_inconsistencies()


    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
        http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'DELETE FROM django_session WHERE expire_date < NOW()'
        self._db.execute(sql)


    def _check_for_uncleanable_db_inconsistencies(self):
        """Run the checks that only report problems and do not modify the
        inconsistent rows.
        """
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries flagged both active and complete."""
        query = models.HostQueueEntry.objects.filter(active=True, complete=True)
        # Count once: one COUNT query instead of two, and the subject
        # reports the same number that triggered the message.
        entry_count = query.count()
        if entry_count != 0:
            subject = ('%d queue entries found with active=complete=1'
                       % entry_count)
            lines = [str(entry.get_object_dict()) for entry in query]
            self._send_inconsistency_message(subject, lines)


    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label.

        Unlike the other checks, this one only logs and sends no email.
        """
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            # logging.warning rather than the deprecated logging.warn alias,
            # matching the other warning calls in this file.
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose valid labels span more than one atomic
        group.
        """
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log the subject as an error and queue a notification email.

        @param subject: one-line summary, used as the email subject.
        @param lines: list of detail strings, joined with newlines to form
                the body; bodies over 5000 characters are truncated.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)