blob: bf455d6eccdab0e7f6991f811d1a1c9b0ebbf8bc [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
4
5
Aviv Keshetf2c9ac92016-09-16 15:44:26 -07006import logging
7import random
8import time
9
10from chromite.lib import metrics
Michael Liangda8c60a2014-06-03 13:24:51 -070011
mblighf3294cc2009-04-08 21:17:38 +000012from autotest_lib.frontend.afe import models
Dan Shic458f662015-04-29 12:12:38 -070013from autotest_lib.scheduler import email_manager
14from autotest_lib.scheduler import scheduler_config
15from autotest_lib.client.common_lib import global_config
showard8dbd05a2010-01-12 18:54:59 +000016from autotest_lib.client.common_lib import host_protections
Gabe Black1e1c41b2015-02-04 23:55:15 -080017from autotest_lib.client.common_lib.cros.graphite import autotest_stats
mblighf3294cc2009-04-08 21:17:38 +000018
mblighf3294cc2009-04-08 21:17:38 +000019
class PeriodicCleanup(object):
    """Base class for cleanup work that has to run on a fixed period.

    Subclasses provide the actual work by overriding _cleanup().
    """


    def __init__(self, db, clean_interval_minutes, run_at_initialize=False):
        """Set up the cleanup schedule.

        @param db: Database connection object, kept on the instance as _db.
        @param clean_interval_minutes: Minutes that must elapse between two
                periodic cleanup runs.
        @param run_at_initialize: True to also run the cleanup from
                initialize(). Default is False.
        """
        self._db = db
        self.clean_interval_minutes = clean_interval_minutes
        self._run_at_initialize = run_at_initialize
        # The period is counted from construction time, so the first
        # periodic run is due one full interval after the object is created.
        self._last_clean_time = time.time()


    def initialize(self):
        """Startup hook for the scheduler; runs the cleanup if requested."""
        if not self._run_at_initialize:
            return
        self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup when more than one interval has gone by."""
        interval_seconds = self.clean_interval_minutes * 60
        next_due = self._last_clean_time + interval_seconds
        if time.time() <= next_due:
            return
        self._cleanup()
        # Only reached when _cleanup() did not raise.
        self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
52
53
class UserCleanup(PeriodicCleanup):
    """Periodic cleanup of jobs, host blocks, invalid objects, dead hosts
    and sessions.

    The run interval is controlled by the global config variable
    clean_interval_minutes in the SCHEDULER section.
    """
    timer = autotest_stats.Timer('monitor_db_cleanup.user_cleanup')


    def __init__(self, db, clean_interval_minutes):
        """Initialize UserCleanup.

        @param db: Database connection object.
        @param clean_interval_minutes: Minutes between two cleanup runs.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Dead-host reverification runs on its own period, tracked here.
        self._last_reverify_time = time.time()


    @timer.decorate
    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/user/durations')
    def _cleanup(self):
        """Run every periodic cleanup step, one after another."""
        logging.info('Running periodic cleanup')
        steps = (
                self._abort_timed_out_jobs,
                self._abort_jobs_past_max_runtime,
                self._clear_inactive_blocks,
                self._check_for_db_inconsistencies,
                self._reverify_dead_hosts,
                self._django_session_cleanup,
        )
        for step in steps:
            step()


    @timer.decorate
    def _abort_timed_out_jobs(self):
        """Abort jobs with incomplete queue entries whose timeout elapsed."""
        logging.info(
                'Aborting all jobs that have timed out and are not complete')
        incomplete_jobs = models.Job.objects.filter(
                hostqueueentry__complete=False)
        timed_out_jobs = incomplete_jobs.extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        for job in timed_out_jobs.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()


    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        overdue_rows = self._db.execute("""
                SELECT hqe.id FROM afe_host_queue_entries AS hqe
                WHERE NOT hqe.complete AND NOT hqe.aborted AND EXISTS
                (select * from afe_jobs where hqe.job_id=afe_jobs.id and
                 hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE < NOW())
                """)
        overdue_ids = [row[0] for row in overdue_rows]
        overdue_entries = models.HostQueueEntry.objects.filter(
                id__in=overdue_ids)
        for queue_entry in overdue_entries.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort()


    @timer.decorate
    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that involve invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: Model class whose invalid objects are inspected.
        @param relation_field: Name of the attribute on first_model objects
                holding the related manager to clear.
        @param second_model: Model class on the other side of the relation.

        @returns A list of error strings, one per invalid object that still
                had related objects. Empty if first_model has no 'invalid'
                field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []

        flagged = list(first_model.objects.filter(invalid=True))
        first_model.objects.populate_relationships(flagged, second_model,
                                                   'related_objects')
        error_lines = []
        for flagged_object in flagged:
            if not flagged_object.related_objects:
                continue
            related_names = ', '.join(
                    str(related) for related in flagged_object.related_objects)
            error_lines.append('Invalid %s %s is related to %ss: %s'
                               % (first_model.__name__, flagged_object,
                                  second_model.__name__, related_names))
            getattr(flagged_object, relation_field).clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Check one many-to-many relation in both directions.

        @param first_model: Model class on one side of the relation.
        @param first_field: Relation attribute name on first_model objects.
        @param second_model: Model class on the other side of the relation.
        @param second_field: Relation attribute name on second_model objects.

        @returns The error strings of both directions, first-to-second ones
                coming first.
        """
        forward_errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        backward_errors = self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model)
        return forward_errors + backward_errors


    def _check_all_invalid_related_objects(self):
        """Check all known model relations; log and email what was cleaned."""
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for pair in model_pairs:
            errors.extend(self._check_invalid_related_objects(*pair))

        if not errors:
            return

        subject = ('%s relationships to invalid models, cleaned all' %
                   len(errors))
        message = '\n'.join(errors)
        logging.warning(subject)
        logging.warning(message)
        email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _clear_inactive_blocks(self):
        """Delete ineligible-host rows of jobs with no incomplete entries."""
        logging.info('Clear out blocks for all completed jobs.')
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
                DELETE ihq FROM afe_ineligible_host_queues ihq
                WHERE NOT EXISTS
                    (SELECT job_id FROM afe_host_queue_entries hqe
                     WHERE NOT hqe.complete AND hqe.job_id = ihq.job_id)""")


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @returns False if the configured period is 0, otherwise True once
                at least one period has passed since the last reverify.
        """
        period_minutes = scheduler_config.config.reverify_period_minutes
        period_seconds = period_minutes * 60
        if period_seconds == 0:
            return False
        deadline = self._last_reverify_time + period_seconds
        return deadline <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify.

        @param hosts: List of hosts that are candidates for reverification.

        @returns A random sample of reverify_max_hosts_at_once hosts if that
                limit is positive and exceeded, otherwise all hosts, sorted.
        """
        limit = scheduler_config.config.reverify_max_hosts_at_once
        if limit <= 0 or len(hosts) <= limit:
            return sorted(hosts)
        return random.sample(hosts, limit)


    @timer.decorate
    def _reverify_dead_hosts(self):
        """Schedule a verify task for hosts stuck in Repair Failed."""
        if not self._should_reverify_hosts_now():
            return

        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        candidates = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False)
        candidates = candidates.exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        dead_hosts = list(candidates)
        if not dead_hosts:
            return

        total_hosts = len(dead_hosts)
        chosen = self._choose_subset_of_hosts_to_reverify(dead_hosts)
        hostnames = ', '.join(host.hostname for host in chosen)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(chosen),
                     total_hosts, hostnames)
        for host in chosen:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)


    @timer.decorate
    def _django_session_cleanup(self):
        """Empty the django_session table, since Django does not do it for us.
        http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
        self._db.execute('TRUNCATE TABLE django_session')
229
230
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """
    timer = autotest_stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')


    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler starts.
                                  Default is set to True.

        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
                db, clean_interval_minutes, run_at_initialize=run_at_initialize)


    @timer.decorate
    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/daily/durations')
    def _cleanup(self):
        """Run the daily consistency checks and the container cleanup."""
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        self._cleanup_orphaned_containers()


    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        """Run all checks for inconsistencies that are only reported."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are both active and complete.

        Aborted entries in that state are repaired by clearing their active
        flag; every offending entry is included in the notification.
        """
        # Evaluate the query exactly once so that the count in the subject
        # always matches the entries listed in the message body.
        entries = list(models.HostQueueEntry.objects.filter(
                active=True, complete=True))
        if not entries:
            return

        subject = ('%d queue entries found with active=complete=1'
                   % len(entries))
        lines = []
        for entry in entries:
            lines.append(str(entry.get_object_dict()))
            if entry.status == 'Aborted':
                logging.error('Aborted entry: %s is both active and '
                              'complete. Setting active value to False.',
                              str(entry))
                entry.active = False
                entry.save()
        self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
                SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                       GROUP_CONCAT(afe_labels.name)
                FROM afe_hosts
                INNER JOIN afe_hosts_labels ON
                        afe_hosts.id = afe_hosts_labels.host_id
                INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
                WHERE afe_labels.platform
                GROUP BY afe_hosts.id
                HAVING platform_count > 1
                ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label."""
        rows = self._db.execute("""
                SELECT hostname
                FROM afe_hosts
                LEFT JOIN afe_hosts_labels
                  ON afe_hosts.id = afe_hosts_labels.host_id
                  AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                    WHERE platform)
                WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose labels map to several atomic groups."""
        rows = self._db.execute("""
                SELECT afe_hosts.id, hostname,
                       COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                       GROUP_CONCAT(afe_labels.name),
                       GROUP_CONCAT(afe_atomic_groups.name)
                FROM afe_hosts
                INNER JOIN afe_hosts_labels ON
                        afe_hosts.id = afe_hosts_labels.host_id
                INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
                INNER JOIN afe_atomic_groups ON
                           afe_labels.atomic_group_id = afe_atomic_groups.id
                WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
                GROUP BY afe_hosts.id
                HAVING atomic_group_count > 1
                ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log an inconsistency and queue a notification email for it.

        @param subject: One-line summary; logged as an error and used as the
                email subject.
        @param lines: List of strings making up the email body. The joined
                body is cut off after 5000 characters.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _cleanup_orphaned_containers(self):
        """Cleanup orphaned containers in each drone.

        The function queues a lxc_cleanup call in each drone without waiting for
        the script to finish, as the cleanup procedure could take minutes and the
        script output is logged.

        """
        # Read the flag as a boolean: with the default string type a
        # configured value of "False" is a non-empty string and would be
        # treated as enabled. A missing key still raises, as before.
        ssp_enabled = global_config.global_config.get_config_value(
                'AUTOSERV', 'enable_ssp_container', type=bool)
        if not ssp_enabled:
            logging.info('Server-side packaging is not enabled, no need to clean'
                         ' up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()