blob: 51d3567b0b27cef4e33f25dc0d20aa9c2f2e8e8e [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
4
5
Aviv Keshetf2c9ac92016-09-16 15:44:26 -07006import logging
7import random
8import time
9
10from chromite.lib import metrics
Michael Liangda8c60a2014-06-03 13:24:51 -070011
mblighf3294cc2009-04-08 21:17:38 +000012from autotest_lib.frontend.afe import models
Dan Shic458f662015-04-29 12:12:38 -070013from autotest_lib.scheduler import email_manager
14from autotest_lib.scheduler import scheduler_config
15from autotest_lib.client.common_lib import global_config
showard8dbd05a2010-01-12 18:54:59 +000016from autotest_lib.client.common_lib import host_protections
mblighf3294cc2009-04-08 21:17:38 +000017
mblighf3294cc2009-04-08 21:17:38 +000018
class PeriodicCleanup(object):
    """Base class for cleanup work that runs on a fixed interval.

    Subclasses provide the actual work by overriding _cleanup().
    """

    def __init__(self, db, clean_interval_minutes, run_at_initialize=False):
        """Store the schedule and start the interval clock.

        @param db: Database connection object, kept for use by subclasses.
        @param clean_interval_minutes: Minutes that must pass after the last
                cleanup before run_cleanup_maybe() runs another one.
        @param run_at_initialize: True to run a cleanup from initialize().
        """
        self._run_at_initialize = run_at_initialize
        self.clean_interval_minutes = clean_interval_minutes
        self._db = db
        # The interval is measured from construction time until the first
        # cleanup triggered by run_cleanup_maybe().
        self._last_clean_time = time.time()


    def initialize(self):
        """Method called by the scheduler at startup.

        Runs one cleanup if run_at_initialize was requested; the interval
        clock is not touched here.
        """
        if not self._run_at_initialize:
            return
        self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup if the interval has elapsed since the last one."""
        interval_seconds = self.clean_interval_minutes * 60
        next_due = self._last_clean_time + interval_seconds
        if time.time() <= next_due:
            return
        self._cleanup()
        # Only reached when _cleanup() returned without raising.
        self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
51
52
class UserCleanup(PeriodicCleanup):
    """Periodic cleanup whose interval is the global config variable
    clean_interval_minutes in the SCHEDULER section.
    """

    def __init__(self, db, clean_interval_minutes):
        """Initialize UserCleanup.

        @param db: Database connection object.
        @param clean_interval_minutes: Minutes between two cleanup runs.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Host reverification runs on its own period, tracked separately
        # from the cleanup interval.
        self._last_reverify_time = time.time()


    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/user/durations')
    def _cleanup(self):
        """Run every periodic cleanup step, in order."""
        logging.info('Running periodic cleanup')
        steps = (
                self._abort_timed_out_jobs,
                self._abort_jobs_past_max_runtime,
                self._clear_inactive_blocks,
                self._check_for_db_inconsistencies,
                self._reverify_dead_hosts,
                self._django_session_cleanup,
        )
        for step in steps:
            step()


    def _abort_timed_out_jobs(self):
        """Abort jobs past their timeout that still have incomplete entries."""
        logging.info(
                'Aborting all jobs that have timed out and are not complete')
        incomplete_jobs = models.Job.objects.filter(
                hostqueueentry__complete=False)
        timed_out_jobs = incomplete_jobs.extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        for timed_out_job in timed_out_jobs.distinct():
            logging.warning('Aborting job %d due to job timeout',
                            timed_out_job.id)
            timed_out_job.abort()


    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        overdue_rows = self._db.execute("""
            SELECT hqe.id FROM afe_host_queue_entries AS hqe
            WHERE NOT hqe.complete AND NOT hqe.aborted AND EXISTS
            (select * from afe_jobs where hqe.job_id=afe_jobs.id and
             hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE < NOW())
            """)
        overdue_ids = [row[0] for row in overdue_rows]
        overdue_entries = models.HostQueueEntry.objects.filter(
                id__in=overdue_ids)
        for overdue_entry in overdue_entries.distinct():
            logging.warning('Aborting entry %s due to max runtime',
                            overdue_entry)
            overdue_entry.abort()


    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that involve invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: Model class whose invalid objects are inspected.
        @param relation_field: Name of the attribute on first_model objects
                holding the related manager to clear.
        @param second_model: Model class on the other side of the relation.

        @return A list with one message per invalid object that still had
                related objects; empty if first_model has no 'invalid' field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []

        invalid_objects = list(first_model.objects.filter(invalid=True))
        first_model.objects.populate_relationships(
                invalid_objects, second_model, 'related_objects')

        error_lines = []
        for invalid_object in invalid_objects:
            if not invalid_object.related_objects:
                continue
            related_names = [str(related_object) for related_object
                             in invalid_object.related_objects]
            error_lines.append(
                    'Invalid %s %s is related to %ss: %s'
                    % (first_model.__name__, invalid_object,
                       second_model.__name__, ', '.join(related_names)))
            getattr(invalid_object, relation_field).clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Check one relation in both directions.

        @return The messages from both one-way checks, first direction first.
        """
        forward_errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        backward_errors = self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model)
        forward_errors.extend(backward_errors)
        return forward_errors


    def _check_all_invalid_related_objects(self):
        """Check every known model relation and report what was cleaned."""
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for model_pair in model_pairs:
            errors.extend(self._check_invalid_related_objects(*model_pair))

        if not errors:
            return

        subject = ('%s relationships to invalid models, cleaned all' %
                   len(errors))
        message = '\n'.join(errors)
        logging.warning(subject)
        logging.warning(message)
        email_manager.manager.enqueue_notify_email(subject, message)


    def _clear_inactive_blocks(self):
        """Delete ineligible-host rows of jobs with no incomplete entries."""
        logging.info('Clear out blocks for all completed jobs.')
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            WHERE NOT EXISTS
                (SELECT job_id FROM afe_host_queue_entries hqe
                 WHERE NOT hqe.complete AND hqe.job_id = ihq.job_id)""")


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @return False when the configured period is 0, otherwise whether at
                least that period has passed since the last reverify.
        """
        period_minutes = scheduler_config.config.reverify_period_minutes
        reverify_period_sec = period_minutes * 60
        if reverify_period_sec == 0:
            return False
        next_reverify_time = self._last_reverify_time + reverify_period_sec
        return next_reverify_time <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify.

        @param hosts: List of hosts needing verification.

        @return A random sample of reverify_max_hosts_at_once hosts when that
                limit is positive and exceeded, otherwise all hosts, sorted.
        """
        limit = scheduler_config.config.reverify_max_hosts_at_once
        if limit <= 0 or len(hosts) <= limit:
            return sorted(hosts)
        return random.sample(hosts, limit)


    def _reverify_dead_hosts(self):
        """Schedule a verify task for hosts whose repair failed."""
        if not self._should_reverify_hosts_now():
            return

        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        dead_hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False).exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        if not dead_hosts:
            return

        dead_hosts = list(dead_hosts)
        total_hosts = len(dead_hosts)
        chosen_hosts = self._choose_subset_of_hosts_to_reverify(dead_hosts)
        hostnames = ', '.join(host.hostname for host in chosen_hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s',
                     len(chosen_hosts), total_hosts, hostnames)
        for host in chosen_hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)


    def _django_session_cleanup(self):
        """Empty django_session, since django doesn't clean it up for us.
        http://www.djangoproject.com/documentation/0.96/sessions/

        The table is truncated, so every stored session is removed.
        """
        logging.info('Deleting old sessions from django_session')
        self._db.execute('TRUNCATE TABLE django_session')
219
220
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """


    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler starts.
                                  Default is set to True.

        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
                db, clean_interval_minutes,
                run_at_initialize=run_at_initialize)


    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/daily/durations')
    def _cleanup(self):
        """Run the daily checks and the orphaned container cleanup."""
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        self._cleanup_orphaned_containers()


    def _check_for_uncleanable_db_inconsistencies(self):
        """Run every check for inconsistencies that can only be reported."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are both active and complete.

        Entries whose status is 'Aborted' are additionally repaired by
        setting active to False and saving them.
        """
        query = models.HostQueueEntry.objects.filter(active=True, complete=True)
        # Count once; each count() call is a separate database query.
        entry_count = query.count()
        if entry_count == 0:
            return

        subject = ('%d queue entries found with active=complete=1'
                   % entry_count)
        lines = []
        for entry in query:
            lines.append(str(entry.get_object_dict()))
            if entry.status == 'Aborted':
                logging.error('Aborted entry: %s is both active and '
                              'complete. Setting active value to False.',
                              str(entry))
                entry.active = False
                entry.save()
        self._send_inconsistency_message(subject, lines)


    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label."""
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            # Only logged, unlike the other checks no email is enqueued.
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose labels span more than one atomic group."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log the subject as an error and enqueue a notification email.

        @param subject: Summary line, used as the email subject.
        @param lines: List of strings joined by newlines into the email body;
                a body longer than 5000 characters is truncated.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)


    def _cleanup_orphaned_containers(self):
        """Cleanup orphaned containers in each drone.

        The function queues a lxc_cleanup call in each drone without waiting for
        the script to finish, as the cleanup procedure could take minutes and the
        script output is logged.

        """
        # Request a bool explicitly: without type=bool the config value is
        # returned as a string, and a configured 'False' would be truthy,
        # so the early return below would never trigger.
        ssp_enabled = global_config.global_config.get_config_value(
                'AUTOSERV', 'enable_ssp_container', type=bool)
        if not ssp_enabled:
            logging.info('Server-side packaging is not enabled, no need to clean'
                         ' up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()