blob: fd5d73bf3ed3716c618b9ecbdce870fd6f630961 [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
4
5
Aviv Keshetf2c9ac92016-09-16 15:44:26 -07006import logging
7import random
8import time
9
10from chromite.lib import metrics
Michael Liangda8c60a2014-06-03 13:24:51 -070011
mblighf3294cc2009-04-08 21:17:38 +000012from autotest_lib.frontend.afe import models
Dan Shic458f662015-04-29 12:12:38 -070013from autotest_lib.scheduler import email_manager
14from autotest_lib.scheduler import scheduler_config
15from autotest_lib.client.common_lib import global_config
showard8dbd05a2010-01-12 18:54:59 +000016from autotest_lib.client.common_lib import host_protections
mblighf3294cc2009-04-08 21:17:38 +000017
mblighf3294cc2009-04-08 21:17:38 +000018
class PeriodicCleanup(object):
    """Base class for cleanup work that is run on a fixed interval.

    Subclasses provide the actual work by overriding _cleanup().
    """

    def __init__(self, db, clean_interval_minutes, run_at_initialize=False):
        """Record the cleanup settings and start the interval clock.

        @param db: Database connection object, stored as self._db.
        @param clean_interval_minutes: Number of minutes that must elapse
                between two periodic cleanup runs.
        @param run_at_initialize: True to also run the cleanup when
                initialize() is called. Default is False.
        """
        self._db = db
        self._run_at_initialize = run_at_initialize
        self.clean_interval_minutes = clean_interval_minutes
        # The interval is measured from construction time, not from the
        # first call to initialize().
        self._last_clean_time = time.time()


    def initialize(self):
        """Startup hook: run the cleanup once if it was requested."""
        if not self._run_at_initialize:
            return
        self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup when the configured interval has fully elapsed."""
        interval_seconds = self.clean_interval_minutes * 60
        next_due = self._last_clean_time + interval_seconds
        if time.time() > next_due:
            self._cleanup()
            # Only reached when _cleanup() returned without raising.
            self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
51
52
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval_minutes in the SCHEDULER section.
    """

    def __init__(self, db, clean_interval_minutes):
        """Initialize UserCleanup.

        @param db: Database connection object.
        @param clean_interval_minutes: Minutes between two cleanup runs.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Dead-host reverification runs on its own period, tracked
        # separately from the general cleanup interval.
        self._last_reverify_time = time.time()


    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/user/durations')
    def _cleanup(self):
        """Run every periodic cleanup step, in order."""
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._reverify_dead_hosts()
        self._django_session_cleanup()


    def _abort_timed_out_jobs(self):
        """Abort jobs with incomplete entries whose timeout has passed."""
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        query = models.Job.objects.filter(hostqueueentry__complete=False).extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        # distinct(): a job with several incomplete entries matches the
        # filter once per entry.
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()


    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        rows = self._db.execute("""
                SELECT hqe.id FROM afe_host_queue_entries AS hqe
                WHERE NOT hqe.complete AND NOT hqe.aborted AND EXISTS
                (select * from afe_jobs where hqe.job_id=afe_jobs.id and
                 hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE < NOW())
                """)
        query = models.HostQueueEntry.objects.filter(
                id__in=[row[0] for row in rows])
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort()


    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that involve invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: Model class whose invalid objects are inspected.
        @param relation_field: Name of the attribute on a first_model object
                that holds the related manager for second_model.
        @param second_model: Model class on the other side of the relation.

        @return: List of messages, one per invalid object that still had
                related objects. Empty if first_model has no 'invalid' field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        first_model.objects.populate_relationships(invalid_objects,
                                                   second_model,
                                                   'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(str(related_object) for related_object
                                         in invalid_object.related_objects)
                error_lines.append('Invalid %s %s is related to %ss: %s'
                                   % (first_model.__name__, invalid_object,
                                      second_model.__name__, related_list))
                # Record the problem first, then drop the stale relations.
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Check one model pair in both directions.

        @param first_model: First model class of the pair.
        @param first_field: Relation attribute on first_model objects.
        @param second_model: Second model class of the pair.
        @param second_field: Relation attribute on second_model objects.

        @return: Combined list of messages from both directions.
        """
        errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        errors.extend(self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model))
        return errors


    def _check_all_invalid_related_objects(self):
        """Check every known model pair and report what was cleaned."""
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in model_pairs:
            errors.extend(self._check_invalid_related_objects(
                    first_model, first_field, second_model, second_field))

        if errors:
            m = 'chromeos/autotest/scheduler/cleanup/invalid_models_cleaned'
            metrics.Counter(m).increment_by(len(errors))
            # logging.warn is a deprecated alias; pass the argument lazily
            # instead of pre-formatting the message.
            logging.warning('Cleaned invalid models due to errors: %s',
                            '\n'.join(errors))


    def _clear_inactive_blocks(self):
        """Delete ineligible-host rows of jobs with no incomplete entries."""
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
                DELETE ihq FROM afe_ineligible_host_queues ihq
                WHERE NOT EXISTS
                    (SELECT job_id FROM afe_host_queue_entries hqe
                     WHERE NOT hqe.complete AND hqe.job_id = ihq.job_id)""")


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @return: False if the configured period is 0, otherwise True once at
                least that many minutes have passed since the last reverify.
        """
        reverify_period_sec = (scheduler_config.config.reverify_period_minutes
                               * 60)
        if reverify_period_sec == 0:
            return False
        return (self._last_reverify_time + reverify_period_sec) <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify.

        @param hosts: List of hosts that need verification.

        @return: A random sample of reverify_max_hosts_at_once hosts when
                that limit is positive and exceeded, otherwise all hosts
                sorted.
        """
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        if max_at_once > 0 and len(hosts) > max_at_once:
            return random.sample(hosts, max_at_once)
        return sorted(hosts)


    def _reverify_dead_hosts(self):
        """Schedule a verify task for hosts whose repair failed."""
        if not self._should_reverify_hosts_now():
            return

        # The timestamp is updated before querying, so the period restarts
        # even when no host qualifies.
        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False)
        hosts = hosts.exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        if not hosts:
            return

        hosts = list(hosts)
        total_hosts = len(hosts)
        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
                     total_hosts, ', '.join(host.hostname for host in hosts))
        for host in hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)


    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't do it for us.

        The table is truncated, so every stored session is removed.
        http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'TRUNCATE TABLE django_session'
        self._db.execute(sql)
216
217
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """


    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler starts.
                                  Default is set to True.

        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
                db, clean_interval_minutes, run_at_initialize=run_at_initialize)


    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/daily/durations')
    def _cleanup(self):
        """Run the daily consistency checks and container cleanup."""
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        self._cleanup_orphaned_containers()


    def _check_for_uncleanable_db_inconsistencies(self):
        """Run every check for inconsistencies that are only reported."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are both active and complete.

        Entries whose status is 'Aborted' are additionally repaired by
        setting active to False.
        """
        query = models.HostQueueEntry.objects.filter(active=True, complete=True)
        # Count once; each count() call issues its own query.
        entry_count = query.count()
        if entry_count != 0:
            subject = ('%d queue entries found with active=complete=1'
                       % entry_count)
            lines = []
            for entry in query:
                lines.append(str(entry.get_object_dict()))
                if entry.status == 'Aborted':
                    logging.error('Aborted entry: %s is both active and '
                                  'complete. Setting active value to False.',
                                  str(entry))
                    entry.active = False
                    entry.save()
            self._send_inconsistency_message(subject, lines)


    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label."""
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
                 AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                   WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            # Logged only; unlike the other checks no email is enqueued.
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose labels span several atomic groups."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log an inconsistency and enqueue a notification email for it.

        @param subject: Summary line; logged as an error and used as the
                email subject.
        @param lines: List of strings joined by newlines to form the email
                body. The body is cut to 5000 characters if longer.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)


    def _cleanup_orphaned_containers(self):
        """Cleanup orphaned containers in each drone.

        The function queues a lxc_cleanup call in each drone without waiting for
        the script to finish, as the cleanup procedure could take minutes and the
        script output is logged.

        """
        # Request a bool so that a configured value of 'False' is not
        # treated as a truthy non-empty string.
        ssp_enabled = global_config.global_config.get_config_value(
                'AUTOSERV', 'enable_ssp_container', type=bool)
        if not ssp_enabled:
            logging.info('Server-side packaging is not enabled, no need to clean'
                         ' up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()