blob: 02b7eb26fb8cd55568cd796496d6ff1fa778ccc1 [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
4
5
Prashanth B372613d2014-05-05 08:40:21 -07006import collections, datetime, time, logging, random
mblighf3294cc2009-04-08 21:17:38 +00007from autotest_lib.database import database_connection
8from autotest_lib.frontend.afe import models
9from autotest_lib.scheduler import email_manager, scheduler_config
showard8dbd05a2010-01-12 18:54:59 +000010from autotest_lib.client.common_lib import host_protections
Alex Milleree632912013-10-08 16:03:12 -070011from autotest_lib.site_utils.graphite import stats
mblighf3294cc2009-04-08 21:17:38 +000012
13
class PeriodicCleanup(object):
    """Base class for cleanup work that is repeated on a fixed interval.

    Subclasses implement _cleanup(); callers invoke initialize() once and
    then call run_cleanup_maybe() as often as they like. The cleanup only
    actually runs when more than clean_interval minutes have elapsed since
    the last run (or since construction).
    """


    def __init__(self, db, clean_interval, run_at_initialize=False):
        """
        @param db: Database handle stored on the instance for use by
                subclasses; this base class never touches it.
        @param clean_interval: Minimum time between cleanups, in minutes.
        @param run_at_initialize: If True, initialize() runs one cleanup
                immediately.
        """
        self._db = db
        self.clean_interval = clean_interval
        # The interval is measured from construction, so the first periodic
        # cleanup happens one full interval after the object is created.
        self._last_clean_time = time.time()
        self._run_at_initialize = run_at_initialize


    def initialize(self):
        """Run a cleanup right away if run_at_initialize was requested.

        Note that this does not reset the periodic timer.
        """
        if self._run_at_initialize:
            self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup if more than clean_interval minutes have passed."""
        # clean_interval is in minutes, time.time() is in seconds.
        next_clean_time = self._last_clean_time + self.clean_interval * 60
        if next_clean_time < time.time():
            self._cleanup()
            # Only recorded after a successful cleanup; if _cleanup() raises,
            # the next call will try again.
            self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method."""
        raise NotImplementedError
40
41
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval in the SCHEDULER section.
    """
    timer = stats.Timer('monitor_db_cleanup.user_cleanup')


    def __init__(self, db, clean_interval_minutes):
        """
        @param db: Database handle used for the raw SQL cleanups.
        @param clean_interval_minutes: Minimum time between cleanups.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Reverification of dead hosts runs on its own period, tracked
        # separately from the cleanup interval.
        self._last_reverify_time = time.time()


    @timer.decorate
    def _cleanup(self):
        """Run every periodic cleanup step, in order."""
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._check_host_assignments()
        self._reverify_dead_hosts()


    @timer.decorate
    def _abort_timed_out_jobs(self):
        """Abort jobs with incomplete queue entries whose timeout elapsed."""
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        incomplete_jobs = models.Job.objects.filter(
                hostqueueentry__complete=False)
        timed_out_jobs = incomplete_jobs.extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        # A job with several incomplete entries is matched once per entry by
        # the join, hence distinct().
        for job in timed_out_jobs.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()


    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        rows = self._db.execute("""
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE <
            NOW()""")
        # Each row holds a single column, the queue entry id.
        overdue_ids = [row[0] for row in rows]
        query = models.HostQueueEntry.objects.filter(id__in=overdue_ids)
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort()


    @classmethod
    def get_overlapping_jobs(cls):
        """A helper method to get all active jobs using the same host.

        @return: A list of dictionaries with the hqe id, job_id and host_id
            of the currently overlapping jobs.
        """
        # Filter all active hqes and stand alone special tasks to make sure
        # a host isn't being used by two jobs at the same time. An incomplete
        # stand alone special task can share a host with an active hqe, an
        # example of this is the cleanup scheduled in gathering.
        hqe_hosts = list(models.HostQueueEntry.objects.filter(
                active=1, complete=0, host_id__isnull=False).values_list(
                'host_id', flat=True))
        special_task_hosts = list(models.SpecialTask.objects.filter(
                is_active=1, is_complete=0, host_id__isnull=False,
                queue_entry_id__isnull=True).values_list('host_id', flat=True))
        usage_counts = collections.Counter(hqe_hosts + special_task_hosts)
        multiple_hosts = [host_id for host_id, uses in usage_counts.items()
                          if uses > 1]
        if not multiple_hosts:
            return []
        # Restrict to active entries: without this every historical hqe that
        # ever ran on one of these hosts would be reported as overlapping.
        return list(models.HostQueueEntry.objects.filter(
                host_id__in=multiple_hosts, active=True).values(
                        'id', 'job_id', 'host_id'))


    @timer.decorate
    def _check_host_assignments(self):
        """Sanity check the current host assignments."""
        subject = 'Unexpected host assignments'
        message = ''.join(
                'HQE %s is using a host in use by another job.\n' % offending_job
                for offending_job in self.get_overlapping_jobs())
        if message:
            email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that point at invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: Model class whose invalid instances are checked.
        @param relation_field: Name of the attribute on first_model instances
                that holds the related manager for second_model.
        @param second_model: Model class on the other side of the relation.

        @return: A list of messages, one per invalid object that still had
                related objects. Empty if first_model has no 'invalid' field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        # Attaches the related second_model objects to each invalid object
        # under the attribute name 'related_objects'.
        first_model.objects.populate_relationships(invalid_objects,
                                                   second_model,
                                                   'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(str(related_object) for related_object
                                         in invalid_object.related_objects)
                error_lines.append('Invalid %s %s is related to %ss: %s'
                                   % (first_model.__name__, invalid_object,
                                      second_model.__name__, related_list))
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Check one relationship in both directions.

        @return: The combined list of messages from both directions.
        """
        errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        errors.extend(self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model))
        return errors


    def _check_all_invalid_related_objects(self):
        """Check every known relationship and report whatever was cleaned."""
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in model_pairs:
            errors.extend(self._check_invalid_related_objects(
                    first_model, first_field, second_model, second_field))

        if errors:
            subject = ('%s relationships to invalid models, cleaned all' %
                       len(errors))
            message = '\n'.join(errors)
            logging.warning(subject)
            logging.warning(message)
            email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _clear_inactive_blocks(self):
        """Delete ineligible-host rows whose job has no incomplete entries."""
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @return: False if the configured period is 0, otherwise True once
                reverify_period_minutes have passed since the last reverify.
        """
        reverify_period_sec = (scheduler_config.config.reverify_period_minutes
                               * 60)
        if reverify_period_sec == 0:
            return False
        return (self._last_reverify_time + reverify_period_sec) <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify."""
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        # A limit of 0 (or less) means no limit.
        if (max_at_once > 0 and len(hosts) > max_at_once):
            return random.sample(hosts, max_at_once)
        return sorted(hosts)


    @timer.decorate
    def _reverify_dead_hosts(self):
        """Schedule a verify task for hosts stuck in Repair Failed.

        Locked hosts, invalid hosts and hosts protected with DO_NOT_VERIFY
        are left alone. Does nothing until the reverify period has elapsed.
        """
        if not self._should_reverify_hosts_now():
            return

        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False)
        hosts = hosts.exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        if not hosts:
            return

        hosts = list(hosts)
        total_hosts = len(hosts)
        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
                     total_hosts, ', '.join(host.hostname for host in hosts))
        for host in hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)
242
243
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """
    timer = stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')


    def __init__(self, db, run_at_initialize=True):
        """
        @param db: Database handle used for the raw SQL checks.
        @param run_at_initialize: If True, initialize() runs one cleanup
                immediately.
        """
        clean_interval = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
                db, clean_interval, run_at_initialize=run_at_initialize)


    @timer.decorate
    def _cleanup(self):
        """Run every daily cleanup step, in order."""
        logging.info('Running 24 hour clean up')
        self._django_session_cleanup()
        self._check_for_uncleanable_db_inconsistencies()


    @timer.decorate
    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
        http://www.djangoproject.com/documentation/0.96/sessions/

        Note that this truncates the whole table, so every session is
        dropped, not only the old ones.
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'TRUNCATE TABLE django_session'
        self._db.execute(sql)


    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        """Run the checks for inconsistencies that can only be reported."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are both active and complete.

        Entries whose status is 'Aborted' are repaired by clearing their
        active flag; all offending entries are included in the report.
        """
        # Fetch once so that the reported count and the listed entries come
        # from the same snapshot, instead of issuing separate count queries.
        entries = list(models.HostQueueEntry.objects.filter(
                active=True, complete=True))
        if not entries:
            return

        subject = ('%d queue entries found with active=complete=1'
                   % len(entries))
        lines = []
        for entry in entries:
            lines.append(str(entry.get_object_dict()))
            if entry.status == 'Aborted':
                logging.error('Aborted entry: %s is both active and '
                              'complete. Setting active value to False.',
                              str(entry))
                entry.active = False
                entry.save()
        self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label."""
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            # Unlike the other checks this one only logs; no email is sent.
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose labels span several atomic groups."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log the subject as an error and queue a notification email.

        @param subject: Email subject, also used as the log message.
        @param lines: Iterable of strings joined with newlines to form the
                email body; the body is cut off after 5000 characters.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)
365 email_manager.manager.enqueue_notify_email(subject, message)