blob: 1c1d78b2c77991a5cbf9404365500407ccedf545 [file] [log] [blame]
mblighf3294cc2009-04-08 21:17:38 +00001"""
2Autotest AFE Cleanup used by the scheduler
3"""
4
5
Michael Liangda8c60a2014-06-03 13:24:51 -07006import time, logging, random
7
mblighf3294cc2009-04-08 21:17:38 +00008from autotest_lib.frontend.afe import models
Dan Shic458f662015-04-29 12:12:38 -07009from autotest_lib.scheduler import email_manager
10from autotest_lib.scheduler import scheduler_config
11from autotest_lib.client.common_lib import global_config
showard8dbd05a2010-01-12 18:54:59 +000012from autotest_lib.client.common_lib import host_protections
Gabe Black1e1c41b2015-02-04 23:55:15 -080013from autotest_lib.client.common_lib.cros.graphite import autotest_stats
mblighf3294cc2009-04-08 21:17:38 +000014
mblighf3294cc2009-04-08 21:17:38 +000015
class PeriodicCleanup(object):
    """Base class for cleanup work that is run once per fixed interval.

    Subclasses supply the actual work by overriding `_cleanup`.
    """


    def __init__(self, db, clean_interval_minutes, run_at_initialize=False):
        """Store the cleanup settings and start the interval clock.

        @param db: Database connection object, stored as self._db.
        @param clean_interval_minutes: Minutes that must elapse after the
                last recorded cleanup before run_cleanup_maybe() runs again.
        @param run_at_initialize: True to run a cleanup from initialize().
                Default is set to False.

        """
        self._db = db
        self.clean_interval_minutes = clean_interval_minutes
        self._run_at_initialize = run_at_initialize
        # The clock starts at construction time, so the first periodic
        # cleanup is due one full interval after the object is created.
        self._last_clean_time = time.time()


    def initialize(self):
        """Method called by scheduler at the startup.

        Runs one cleanup if run_at_initialize was requested; the interval
        clock is not touched here.
        """
        if not self._run_at_initialize:
            return
        self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup if strictly more than one interval has elapsed.

        The last-clean timestamp is only refreshed after _cleanup() returns,
        so a cleanup that raises leaves the timestamp unchanged.
        """
        interval_seconds = self.clean_interval_minutes * 60
        due_time = self._last_clean_time + interval_seconds
        if not due_time < time.time():
            return
        self._cleanup()
        self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method."""
        raise NotImplementedError
48
49
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval_minutes in the SCHEDULER section.
    """
    timer = autotest_stats.Timer('monitor_db_cleanup.user_cleanup')


    def __init__(self, db, clean_interval_minutes):
        """Initialize UserCleanup.

        @param db: Database connection object.
        @param clean_interval_minutes: Minutes between two cleanup runs.

        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Dead-host reverification keeps its own clock, separate from the
        # cleanup interval handled by the base class.
        self._last_reverify_time = time.time()


    @timer.decorate
    def _cleanup(self):
        """Run every periodic cleanup step, one after the other."""
        logging.info('Running periodic cleanup')
        steps = (
                self._abort_timed_out_jobs,
                self._abort_jobs_past_max_runtime,
                self._clear_inactive_blocks,
                self._check_for_db_inconsistencies,
                self._reverify_dead_hosts,
                self._django_session_cleanup,
        )
        for step in steps:
            step()


    @timer.decorate
    def _abort_timed_out_jobs(self):
        """Abort jobs with incomplete queue entries that passed their timeout.

        A job is considered timed out when created_on plus timeout_mins
        minutes lies before the database's NOW().
        """
        logging.info(
                'Aborting all jobs that have timed out and are not complete')
        timed_out_clause = 'created_on + INTERVAL timeout_mins MINUTE < NOW()'
        incomplete_jobs = models.Job.objects.filter(
                hostqueueentry__complete=False)
        timed_out_jobs = incomplete_jobs.extra(where=[timed_out_clause])
        for job in timed_out_jobs.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()


    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        overdue_rows = self._db.execute("""
            SELECT hqe.id FROM afe_host_queue_entries AS hqe
            WHERE NOT hqe.complete AND NOT hqe.aborted AND EXISTS
            (select * from afe_jobs where hqe.job_id=afe_jobs.id and
             hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE < NOW())
            """)
        # The raw query only yields ids; load the model objects so that each
        # entry can be aborted through the model layer.
        overdue_ids = [row[0] for row in overdue_rows]
        overdue_entries = models.HostQueueEntry.objects.filter(
                id__in=overdue_ids)
        for queue_entry in overdue_entries.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort()


    @timer.decorate
    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that point at invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: Model class whose invalid objects are inspected.
        @param relation_field: Name of the attribute on a first_model object
                that holds the related manager to clear.
        @param second_model: Model class on the other side of the relation.

        @return A list with one error line per invalid object that still had
                related objects; empty if first_model has no 'invalid' field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []

        invalid_objects = list(first_model.objects.filter(invalid=True))
        first_model.objects.populate_relationships(
                invalid_objects, second_model, 'related_objects')

        error_lines = []
        for invalid_object in invalid_objects:
            related_objects = invalid_object.related_objects
            if not related_objects:
                continue
            related_names = [str(related) for related in related_objects]
            error_lines.append(
                    'Invalid %s %s is related to %ss: %s'
                    % (first_model.__name__, invalid_object,
                       second_model.__name__, ', '.join(related_names)))
            getattr(invalid_object, relation_field).clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Check one relation in both directions.

        @param first_model: Model class on one side of the relation.
        @param first_field: Relation attribute name on first_model objects.
        @param second_model: Model class on the other side of the relation.
        @param second_field: Relation attribute name on second_model objects.

        @return The error lines of the first-to-second check followed by
                those of the second-to-first check.
        """
        forward_errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        backward_errors = self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model)
        return forward_errors + backward_errors


    def _check_all_invalid_related_objects(self):
        """Check every known model pair and report what was cleaned.

        If anything was found, the summary is logged as a warning and queued
        as a notification email.
        """
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for pair in model_pairs:
            errors.extend(self._check_invalid_related_objects(*pair))

        if not errors:
            return

        subject = ('%s relationships to invalid models, cleaned all' %
                   len(errors))
        message = '\n'.join(errors)
        logging.warning(subject)
        logging.warning(message)
        email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _clear_inactive_blocks(self):
        """Delete ineligible-host-queue rows of jobs with no incomplete entry."""
        logging.info('Clear out blocks for all completed jobs.')
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            WHERE NOT EXISTS
                (SELECT job_id FROM afe_host_queue_entries hqe
                 WHERE NOT hqe.complete AND hqe.job_id = ihq.job_id)""")


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @return False if the configured period is 0; otherwise True once at
                least one full period has passed since the last reverify.
        """
        period_minutes = scheduler_config.config.reverify_period_minutes
        reverify_period_sec = period_minutes * 60
        if reverify_period_sec == 0:
            return False
        next_reverify_time = self._last_reverify_time + reverify_period_sec
        return next_reverify_time <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify.

        @param hosts: Sequence of hosts that need verification.

        @return All hosts, sorted, unless a positive
                reverify_max_hosts_at_once is configured and exceeded, in
                which case a random sample of that size is returned.
        """
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        if max_at_once <= 0 or len(hosts) <= max_at_once:
            return sorted(hosts)
        return random.sample(hosts, max_at_once)


    @timer.decorate
    def _reverify_dead_hosts(self):
        """Schedule a VERIFY special task for hosts stuck in Repair Failed.

        Only unlocked, valid hosts whose protection is not DO_NOT_VERIFY are
        considered, and nothing happens until the reverify period is over.
        """
        if not self._should_reverify_hosts_now():
            return

        # The clock is reset before looking for hosts, so a period with no
        # dead hosts still counts as a completed reverify round.
        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        dead_hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False).exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        if not dead_hosts:
            return

        all_dead_hosts = list(dead_hosts)
        chosen_hosts = self._choose_subset_of_hosts_to_reverify(all_dead_hosts)
        hostnames = ', '.join(host.hostname for host in chosen_hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(chosen_hosts),
                     len(all_dead_hosts), hostnames)
        for host in chosen_hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)


    @timer.decorate
    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
        http://www.djangoproject.com/documentation/0.96/sessions/

        The table is truncated, so every stored session row is removed.
        """
        logging.info('Deleting old sessions from django_session')
        self._db.execute('TRUNCATE TABLE django_session')
223
224
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """
    timer = autotest_stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')


    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler starts.
                                  Default is set to True.

        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
            db, clean_interval_minutes, run_at_initialize=run_at_initialize)


    @timer.decorate
    def _cleanup(self):
        """Run the daily consistency checks and the container cleanup."""
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        self._cleanup_orphaned_containers()


    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        """Run every check for inconsistencies that are only reported."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are both active and complete.

        Entries whose status is 'Aborted' are additionally repaired by
        setting active to False and saving them.
        """
        # Evaluate the query exactly once: calling count() separately from
        # the iteration issues extra queries and lets the number in the
        # subject disagree with the entries listed in the message.
        entries = list(
                models.HostQueueEntry.objects.filter(active=True,
                                                     complete=True))
        if not entries:
            return

        subject = ('%d queue entries found with active=complete=1'
                   % len(entries))
        lines = []
        for entry in entries:
            # Record the entry as it was found, before any repair below.
            lines.append(str(entry.get_object_dict()))
            if entry.status == 'Aborted':
                logging.error('Aborted entry: %s is both active and '
                              'complete. Setting active value to False.',
                              str(entry))
                entry.active = False
                entry.save()
        self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label."""
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            # Unlike the other checks this one only logs; no email is queued.
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose valid labels span several atomic groups."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log the subject as an error and queue a notification email.

        @param subject: Summary line, used for the log and the email subject.
        @param lines: Detail lines, joined with newlines to form the email
                body; the body is cut off after 5000 characters.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _cleanup_orphaned_containers(self):
        """Cleanup orphaned containers in each drone.

        Delegates to drone_manager.cleanup_orphaned_containers(). According
        to the original note, that queues a lxc_cleanup call in each drone
        without waiting for the script to finish, as the cleanup procedure
        could take minutes and the script output is logged.
        NOTE(review): the queuing behavior lives in the drone manager and is
        not visible in this file - confirm there.

        """
        # Request a boolean explicitly: read as a plain string, a configured
        # value of "False" is a non-empty string and would count as enabled.
        ssp_enabled = global_config.global_config.get_config_value(
                'AUTOSERV', 'enable_ssp_container', type=bool)
        if not ssp_enabled:
            logging.info('Server-side packaging is not enabled, no need to clean'
                         ' up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()