blob: 0ce8e5fc77e446b493365ca873b7d4c6929e3299 [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
4
5
Michael Liangda8c60a2014-06-03 13:24:51 -07006import time, logging, random
7
mblighf3294cc2009-04-08 21:17:38 +00008from autotest_lib.frontend.afe import models
9from autotest_lib.scheduler import email_manager, scheduler_config
showard8dbd05a2010-01-12 18:54:59 +000010from autotest_lib.client.common_lib import host_protections
Michael Liangda8c60a2014-06-03 13:24:51 -070011from autotest_lib.client.common_lib.cros.graphite import stats
mblighf3294cc2009-04-08 21:17:38 +000012
class PeriodicCleanup(object):
    """Base class for cleanup work that is re-run on a fixed interval.

    Subclasses provide the actual work by overriding _cleanup().
    initialize() optionally runs one cleanup right away, and
    run_cleanup_maybe() runs a cleanup once more than clean_interval
    minutes have passed since the previous periodic run.
    """


    def __init__(self, db, clean_interval, run_at_initialize=False):
        """
        @param db: Database handle, stored as self._db. This base class
                does not use it itself.
        @param clean_interval: Minimum time between periodic cleanups, in
                minutes.
        @param run_at_initialize: When True, initialize() runs a cleanup
                immediately.
        """
        self._db = db
        self.clean_interval = clean_interval
        # The interval is measured from construction, so the first periodic
        # cleanup is due one full interval after the object is created.
        self._last_clean_time = time.time()
        self._run_at_initialize = run_at_initialize


    def initialize(self):
        """Run a cleanup immediately if run_at_initialize was requested.

        This does not reset the periodic timer.
        """
        if self._run_at_initialize:
            self._cleanup()


    def run_cleanup_maybe(self):
        """Run _cleanup() if more than clean_interval minutes have elapsed.

        The timer is reset only after _cleanup() returns, so a cleanup that
        raises leaves the timer untouched and is attempted again on the next
        call.
        """
        # clean_interval is in minutes; time.time() is in seconds.
        should_cleanup = (self._last_clean_time + self.clean_interval * 60
                          < time.time())
        if should_cleanup:
            self._cleanup()
            self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
39
40
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval in the SCHEDULER section.

    Each run aborts timed-out jobs and over-runtime queue entries, clears
    stale ineligible-host blocks, severs relationships that still point at
    invalid objects, schedules reverification of hosts that failed repair,
    and empties the django_session table.
    """
    timer = stats.Timer('monitor_db_cleanup.user_cleanup')


    def __init__(self, db, clean_interval_minutes):
        """
        @param db: Database handle used for the raw SQL statements below.
        @param clean_interval_minutes: Minutes between periodic cleanups.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Host reverification runs on its own period (see
        # _should_reverify_hosts_now), so it keeps its own timestamp.
        self._last_reverify_time = time.time()


    @timer.decorate
    def _cleanup(self):
        """Run every periodic cleanup step, in order."""
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._reverify_dead_hosts()
        self._django_session_cleanup()


    @timer.decorate
    def _abort_timed_out_jobs(self):
        """Abort jobs with incomplete queue entries whose timeout has passed.

        A job is timed out when created_on plus timeout_mins minutes is
        earlier than the database's NOW().
        """
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        query = models.Job.objects.filter(hostqueueentry__complete=False).extra(
            where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        # distinct(): a job with several incomplete entries would otherwise
        # be returned once per entry by the join.
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()


    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        rows = self._db.execute("""
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE <
            NOW()""")
        # The raw query only yields ids; reload them as model objects so the
        # model's abort() logic can be used.
        query = models.HostQueueEntry.objects.filter(
            id__in=[row[0] for row in rows])
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort()


    @timer.decorate
    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that reference invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: Model class whose invalid instances are checked.
                Models without an 'invalid' field are skipped.
        @param relation_field: Name of the attribute on first_model
                instances that holds the related manager to second_model.
        @param second_model: Model class on the other side of the relation.

        @returns A list with one error line per invalid object that still
                had related objects; those relations have been cleared.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        # Attaches the related second_model objects to each invalid object
        # under the attribute name 'related_objects'.
        first_model.objects.populate_relationships(invalid_objects,
                                                   second_model,
                                                   'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(str(related_object) for related_object
                                         in invalid_object.related_objects)
                error_lines.append('Invalid %s %s is related to %ss: %s'
                                   % (first_model.__name__, invalid_object,
                                      second_model.__name__, related_list))
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Check one relationship in both directions.

        @param first_model: Model class on one side of the relation.
        @param first_field: Relation attribute name on first_model.
        @param second_model: Model class on the other side.
        @param second_field: Relation attribute name on second_model.

        @returns The combined list of error lines from both directions.
        """
        errors = self._check_invalid_related_objects_one_way(
            first_model, first_field, second_model)
        errors.extend(self._check_invalid_related_objects_one_way(
            second_model, second_field, first_model))
        return errors


    def _check_all_invalid_related_objects(self):
        """Check every known model pair and report anything that was cleaned.

        When at least one relationship was cleared, the details are logged
        as warnings and queued as a notification email.
        """
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in model_pairs:
            errors.extend(self._check_invalid_related_objects(
                first_model, first_field, second_model, second_field))

        if errors:
            subject = ('%s relationships to invalid models, cleaned all' %
                       len(errors))
            message = '\n'.join(errors)
            logging.warning(subject)
            logging.warning(message)
            email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _clear_inactive_blocks(self):
        """Delete ineligible-host rows for jobs with no incomplete entries."""
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")


    def _should_reverify_hosts_now(self):
        """Tell whether the host reverification period has elapsed.

        @returns False when the configured reverify period is 0; otherwise
                True once reverify_period_minutes have passed since the
                last reverification.
        """
        reverify_period_sec = (scheduler_config.config.reverify_period_minutes
                               * 60)
        if reverify_period_sec == 0:
            return False
        return (self._last_reverify_time + reverify_period_sec) <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify.

        @param hosts: List of hosts that are candidates for reverification.

        @returns A random sample of reverify_max_hosts_at_once hosts when
                that limit is positive and exceeded; otherwise all hosts,
                sorted.
        """
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        if max_at_once > 0 and len(hosts) > max_at_once:
            return random.sample(hosts, max_at_once)
        return sorted(hosts)


    @timer.decorate
    def _reverify_dead_hosts(self):
        """Schedule a verify task for hosts stuck in Repair Failed.

        Only unlocked, valid hosts whose protection level is not
        DO_NOT_VERIFY are considered, and only when the reverify period has
        elapsed.
        """
        if not self._should_reverify_hosts_now():
            return

        # Reset the timer before querying, so the period restarts even when
        # no host needs reverification.
        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        hosts = models.Host.objects.filter(
            status=models.Host.Status.REPAIR_FAILED,
            locked=False,
            invalid=False)
        hosts = hosts.exclude(
            protection=host_protections.Protection.DO_NOT_VERIFY)
        if not hosts:
            return

        hosts = list(hosts)
        total_hosts = len(hosts)
        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
                     total_hosts, ', '.join(host.hostname for host in hosts))
        for host in hosts:
            models.SpecialTask.schedule_special_task(
                host=host, task=models.SpecialTask.Task.VERIFY)


    @timer.decorate
    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
        http://www.djangoproject.com/documentation/0.96/sessions/

        The statement issued is a TRUNCATE, so every row in the table is
        removed, not only expired sessions.
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'TRUNCATE TABLE django_session'
        self._db.execute(sql)
215
216
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.

    The checks here look for database inconsistencies that are reported
    rather than cleaned automatically. The one exception is that aborted
    queue entries found both active and complete are set inactive.
    """
    timer = stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')


    def __init__(self, db, run_at_initialize=True):
        """
        @param db: Database handle used for the raw SQL checks below.
        @param run_at_initialize: When True (the default), initialize()
                runs the checks immediately.
        """
        clean_interval = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
            db, clean_interval, run_at_initialize=run_at_initialize)


    @timer.decorate
    def _cleanup(self):
        """Run the daily consistency checks."""
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()


    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        """Run each inconsistency check in turn."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are both active and complete.

        Entries whose status is 'Aborted' are additionally repaired by
        setting active to False. All offending entries are included in the
        inconsistency report.
        """
        query = models.HostQueueEntry.objects.filter(active=True, complete=True)
        # Count once: each count() call is a separate database query, and
        # reusing the value keeps the guard and the subject line consistent.
        entry_count = query.count()
        if entry_count == 0:
            return

        subject = ('%d queue entries found with active=complete=1'
                   % entry_count)
        lines = []
        for entry in query:
            lines.append(str(entry.get_object_dict()))
            if entry.status == 'Aborted':
                logging.error('Aborted entry: %s is both active and '
                              'complete. Setting active value to False.',
                              entry)
                entry.active = False
                entry.save()
        self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label.

        Unlike the other checks, this one only logs; no email is queued.
        """
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose valid labels span several atomic groups."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log the subject as an error and queue a notification email.

        @param subject: One-line summary, used as the log message and the
                email subject.
        @param lines: Iterable of strings joined with newlines to form the
                email body. Bodies longer than 5000 characters are cut off
                and marked as truncated.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)