mbligh | f3294cc | 2009-04-08 21:17:38 +0000 | [diff] [blame] | 1 | """ |
| 2 | Autotest AFE Cleanup used by the scheduler |
| 3 | """ |
| 4 | |
| 5 | |
Eric Li | e0493a4 | 2010-11-15 13:05:43 -0800 | [diff] [blame] | 6 | import datetime, time, logging, random |
mbligh | f3294cc | 2009-04-08 21:17:38 +0000 | [diff] [blame] | 7 | from autotest_lib.database import database_connection |
| 8 | from autotest_lib.frontend.afe import models |
| 9 | from autotest_lib.scheduler import email_manager, scheduler_config |
showard | 8dbd05a | 2010-01-12 18:54:59 +0000 | [diff] [blame] | 10 | from autotest_lib.client.common_lib import host_protections |
Alex Miller | ee63291 | 2013-10-08 16:03:12 -0700 | [diff] [blame] | 11 | from autotest_lib.site_utils.graphite import stats |
mbligh | f3294cc | 2009-04-08 21:17:38 +0000 | [diff] [blame] | 12 | |
| 13 | |
class PeriodicCleanup(object):
    """Base class for cleanup tasks that run at most once per interval.

    Subclasses supply the actual work by overriding _cleanup().
    """


    def __init__(self, db, clean_interval, run_at_initialize=False):
        """
        @param db: Database handle, stored as self._db for subclasses.
        @param clean_interval: Minimum number of minutes between two runs
                triggered through run_cleanup_maybe().
        @param run_at_initialize: When True, initialize() performs one
                cleanup immediately.
        """
        self._db = db
        self.clean_interval = clean_interval
        self._run_at_initialize = run_at_initialize
        # The interval is measured from construction time, so the first
        # periodic run happens one full interval after the object is built.
        self._last_clean_time = time.time()


    def initialize(self):
        """Run a single cleanup right away if it was requested.

        The periodic timer is not reset by this call.
        """
        if not self._run_at_initialize:
            return
        self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup if more than clean_interval minutes have passed
        since the previous run (or since construction).
        """
        # clean_interval is expressed in minutes; time.time() in seconds.
        next_run_time = self._last_clean_time + self.clean_interval * 60
        if time.time() <= next_run_time:
            return
        self._cleanup()
        # Only reached when _cleanup() returned without raising.
        self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it.

        @raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
| 40 | |
| 41 | |
class UserCleanup(PeriodicCleanup):
    """Periodic cleanup whose interval is controlled by the global config
    variable clean_interval in the SCHEDULER section.
    """
    timer = stats.Timer('monitor_db_cleanup.user_cleanup')


    def __init__(self, db, clean_interval_minutes):
        """
        @param db: Database handle used for the raw SQL statements.
        @param clean_interval_minutes: Minutes between periodic cleanups.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Host reverification runs on its own period, tracked separately
        # from the cleanup interval.
        self._last_reverify_time = time.time()


    @timer.decorate
    def _cleanup(self):
        """Run every user cleanup step, in a fixed order."""
        logging.info('Running periodic cleanup')
        steps = (
            self._abort_timed_out_jobs,
            self._abort_jobs_past_max_runtime,
            self._clear_inactive_blocks,
            self._check_for_db_inconsistencies,
            self._reverify_dead_hosts,
        )
        for step in steps:
            step()


    @timer.decorate
    def _abort_timed_out_jobs(self):
        """Abort jobs that still have incomplete queue entries and whose
        creation time plus timeout_mins lies in the past.
        """
        logging.info('Aborting all jobs that have timed out and are not '
                     'complete')
        incomplete_jobs = models.Job.objects.filter(
                hostqueueentry__complete=False)
        timed_out_jobs = incomplete_jobs.extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        # The filter spans the queue-entry relation, so distinct() keeps
        # each job from being visited once per matching entry.
        for timed_out_job in timed_out_jobs.distinct():
            logging.warning('Aborting job %d due to job timeout',
                            timed_out_job.id)
            timed_out_job.abort()


    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        overdue_sql = """
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE <
            NOW()"""
        rows = self._db.execute(overdue_sql)
        overdue_ids = [row[0] for row in rows]
        # Re-fetch through the ORM so that the model's abort() can be used.
        overdue_entries = models.HostQueueEntry.objects.filter(
                id__in=overdue_ids)
        for overdue_entry in overdue_entries.distinct():
            logging.warning('Aborting entry %s due to max runtime',
                            overdue_entry)
            overdue_entry.abort()


    @timer.decorate
    def _check_for_db_inconsistencies(self):
        """Detect and clean relationships that point at invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relationships from invalid first_model objects to
        second_model objects.

        @param first_model: Model class whose invalid objects are examined.
        @param relation_field: Name of the attribute on first_model objects
                holding the related manager to second_model.
        @param second_model: Model class on the other side of the relation.

        @returns A list with one error line per invalid object that still
                had related objects; empty if first_model has no 'invalid'
                field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []

        invalid_objects = list(first_model.objects.filter(invalid=True))
        # Attaches the related second_model objects to each invalid object
        # under the attribute name 'related_objects'.
        first_model.objects.populate_relationships(
                invalid_objects, second_model, 'related_objects')

        error_lines = []
        for invalid_object in invalid_objects:
            related = invalid_object.related_objects
            if not related:
                continue
            related_list = ', '.join([str(item) for item in related])
            error_lines.append(
                    'Invalid %s %s is related to %ss: %s'
                    % (first_model.__name__, invalid_object,
                       second_model.__name__, related_list))
            getattr(invalid_object, relation_field).clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Clean a many-to-many relation in both directions.

        @param first_model: Model class on one side of the relation.
        @param first_field: Relation attribute name on first_model objects.
        @param second_model: Model class on the other side.
        @param second_field: Relation attribute name on second_model objects.

        @returns The error lines from both directions, first-to-second
                lines first.
        """
        forward_errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        backward_errors = self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model)
        return forward_errors + backward_errors


    def _check_all_invalid_related_objects(self):
        """Clean every known relation pair and report what was cleaned.

        When anything was cleaned, the findings are logged as warnings and
        queued as a notification email.
        """
        model_pairs = (
            (models.Host, 'labels', models.Label, 'host_set'),
            (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
            (models.AclGroup, 'users', models.User, 'aclgroup_set'),
            (models.Test, 'dependency_labels', models.Label, 'test_set'),
        )
        errors = []
        for pair in model_pairs:
            errors.extend(self._check_invalid_related_objects(*pair))

        if not errors:
            return

        subject = ('%s relationships to invalid models, cleaned all' %
                   len(errors))
        message = '\n'.join(errors)
        logging.warning(subject)
        logging.warning(message)
        email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _clear_inactive_blocks(self):
        """Delete ineligible-host-queue rows of jobs that have no incomplete
        queue entries left.
        """
        logging.info('Clear out blocks for all completed jobs.')
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        delete_sql = """
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL"""
        self._db.execute(delete_sql)


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @returns False when reverify_period_minutes is configured as 0,
                otherwise whether at least that many minutes have passed
                since the last reverification.
        """
        period_minutes = scheduler_config.config.reverify_period_minutes
        reverify_period_sec = period_minutes * 60
        if reverify_period_sec == 0:
            return False
        next_reverify_time = self._last_reverify_time + reverify_period_sec
        return next_reverify_time <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify.

        @param hosts: List of hosts that are candidates for reverification.

        @returns A random sample of reverify_max_hosts_at_once hosts when
                that limit is positive and exceeded, otherwise all hosts,
                sorted.
        """
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        if max_at_once <= 0 or len(hosts) <= max_at_once:
            return sorted(hosts)
        return random.sample(hosts, max_at_once)


    @timer.decorate
    def _reverify_dead_hosts(self):
        """Schedule a VERIFY special task for hosts stuck in Repair Failed.

        Locked hosts, invalid hosts and hosts protected with DO_NOT_VERIFY
        are left alone.
        """
        if not self._should_reverify_hosts_now():
            return

        # Stamp the time before querying, so the period restarts even when
        # no host ends up being scheduled.
        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        dead_hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False).exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        if not dead_hosts:
            return

        all_dead_hosts = list(dead_hosts)
        total_hosts = len(all_dead_hosts)
        chosen_hosts = self._choose_subset_of_hosts_to_reverify(
                all_dead_hosts)
        hostnames = ', '.join(host.hostname for host in chosen_hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s',
                     len(chosen_hosts), total_hosts, hostnames)
        for host in chosen_hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)
| 205 | |
| 206 | |
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """
    timer = stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')


    def __init__(self, db, run_at_initialize=True):
        """
        @param db: Database handle used for the raw SQL statements.
        @param run_at_initialize: When True (the default), initialize()
                performs one cleanup immediately.
        """
        clean_interval = 24 * 60 # 24 hours, expressed in minutes
        super(TwentyFourHourUpkeep, self).__init__(
            db, clean_interval, run_at_initialize=run_at_initialize)


    @timer.decorate
    def _cleanup(self):
        """Run the daily upkeep steps."""
        logging.info('Running 24 hour clean up')
        self._django_session_cleanup()
        self._check_for_uncleanable_db_inconsistencies()


    @timer.decorate
    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
        http://www.djangoproject.com/documentation/0.96/sessions/

        Note that the table is truncated, so every stored session is
        removed, not only expired ones.
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'TRUNCATE TABLE django_session'
        self._db.execute(sql)


    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        """Run the checks for inconsistencies that are reported rather than
        fully repaired.
        """
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries flagged both active and complete.

        Entries whose status is 'Aborted' are repaired by clearing their
        active flag; all offending entries are included in the report.
        """
        # Evaluate the queryset exactly once so that the count in the
        # subject always matches the entries listed in the message, and so
        # that the database is queried once instead of three times.
        entries = list(models.HostQueueEntry.objects.filter(
                active=True, complete=True))
        if not entries:
            return

        subject = ('%d queue entries found with active=complete=1'
                   % len(entries))
        lines = []
        for entry in entries:
            lines.append(str(entry.get_object_dict()))
            if entry.status == 'Aborted':
                logging.error('Aborted entry: %s is both active and '
                              'complete. Setting active value to False.',
                              str(entry))
                entry.active = False
                entry.save()
        self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label.

        Unlike the other checks, this one only logs; no email is queued.
        """
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            # logging.warn is a deprecated alias; use logging.warning like
            # the rest of this module.
            logging.warning('%s hosts with no platform\n%s',
                            self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose valid labels belong to more than one
        distinct atomic group.
        """
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log an inconsistency and queue a notification email about it.

        @param subject: One-line summary; logged as an error and used as
                the email subject.
        @param lines: List of strings joined with newlines to form the
                email body.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        # Cap the body at 5000 characters and mark it when it was cut.
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)