"""
Autotest AFE Cleanup used by the scheduler
"""


import logging
import random
import time

from autotest_lib.client.common_lib import utils
from autotest_lib.frontend.afe import models
from autotest_lib.scheduler import email_manager
from autotest_lib.scheduler import scheduler_config
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import host_protections

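# chromite may not be importable outside the Chrome OS environment; fall back
# to the mock metrics object so the metrics calls below still resolve.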
try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


class PeriodicCleanup(object):
    """Base class for scheduling periodic cleanup work.
    """

    def __init__(self, db, clean_interval_minutes, run_at_initialize=False):
        self._db = db
        self.clean_interval_minutes = clean_interval_minutes
        self._last_clean_time = time.time()
        self._run_at_initialize = run_at_initialize


    def initialize(self):
        """Method called by the scheduler at startup.
        """
        if self._run_at_initialize:
            self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup if the cleanup interval has elapsed.
        """
        should_cleanup = (self._last_clean_time +
                          self.clean_interval_minutes * 60
                          < time.time())
        if should_cleanup:
            self._cleanup()
            self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method."""
        raise NotImplementedError


class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval_minutes in the SCHEDULER section.
    """

    def __init__(self, db, clean_interval_minutes):
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        self._last_reverify_time = time.time()


    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/user/durations')
    def _cleanup(self):
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._reverify_dead_hosts()
        self._django_session_cleanup()


    def _abort_timed_out_jobs(self):
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
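        # A job has timed out when its created_on plus its timeout_mins is in
        # the past while it still has incomplete host queue entries.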
        query = models.Job.objects.filter(hostqueueentry__complete=False).extra(
            where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()


    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
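        # Select incomplete, unaborted queue entries whose started_on plus the
        # owning job's max_runtime_mins is already in the past.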
        rows = self._db.execute("""
            SELECT hqe.id FROM afe_host_queue_entries AS hqe
            WHERE NOT hqe.complete AND NOT hqe.aborted AND EXISTS
            (select * from afe_jobs where hqe.job_id=afe_jobs.id and
             hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE < NOW())
            """)
        query = models.HostQueueEntry.objects.filter(
                id__in=[row[0] for row in rows])
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort()


    def _check_for_db_inconsistencies(self):
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
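        """Return error strings for invalid first_model objects that still have
        second_model relations, clearing those relations as a side effect.
        """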
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        first_model.objects.populate_relationships(invalid_objects,
                                                   second_model,
                                                   'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(str(related_object) for related_object
                                         in invalid_object.related_objects)
                error_lines.append('Invalid %s %s is related to %ss: %s'
                                   % (first_model.__name__, invalid_object,
                                      second_model.__name__, related_list))
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        errors.extend(self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model))
        return errors


    def _check_all_invalid_related_objects(self):
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in model_pairs:
            errors.extend(self._check_invalid_related_objects(
                    first_model, first_field, second_model, second_field))

        if errors:
            m = 'chromeos/autotest/scheduler/cleanup/invalid_models_cleaned'
            metrics.Counter(m).increment_by(len(errors))
            logging.warning('Cleaned invalid models due to errors: %s',
                            '\n'.join(errors))


    def _clear_inactive_blocks(self):
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            WHERE NOT EXISTS
                (SELECT job_id FROM afe_host_queue_entries hqe
                 WHERE NOT hqe.complete AND hqe.job_id = ihq.job_id)""")


    def _should_reverify_hosts_now(self):
        reverify_period_sec = (scheduler_config.config.reverify_period_minutes
                               * 60)
        if reverify_period_sec == 0:
            return False
        return (self._last_reverify_time + reverify_period_sec) <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify."""
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        if (max_at_once > 0 and len(hosts) > max_at_once):
            return random.sample(hosts, max_at_once)
        return sorted(hosts)


    def _reverify_dead_hosts(self):
        if not self._should_reverify_hosts_now():
            return

        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False)
        hosts = hosts.exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        if not hosts:
            return

        hosts = list(hosts)
        total_hosts = len(hosts)
        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
                     total_hosts, ', '.join(host.hostname for host in hosts))
        for host in hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)


    def _django_session_cleanup(self):
        """Clean up django_session since Django doesn't do it for us.
        http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
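        # TRUNCATE drops every row, so currently-active sessions are removed
        # along with the stale ones.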
        sql = 'TRUNCATE TABLE django_session'
        self._db.execute(sql)


class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every twenty-four
    hours thereafter.
    """


    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler starts.
                                  Default is set to True.

        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60  # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
            db, clean_interval_minutes, run_at_initialize=run_at_initialize)


    @metrics.SecondsTimerDecorator(
            'chromeos/autotest/scheduler/cleanup/daily/durations')
    def _cleanup(self):
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        self._cleanup_orphaned_containers()


    def _check_for_uncleanable_db_inconsistencies(self):
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    def _check_for_active_and_complete_queue_entries(self):
        query = models.HostQueueEntry.objects.filter(active=True, complete=True)
        if query.count() != 0:
            subject = ('%d queue entries found with active=complete=1'
                       % query.count())
            lines = []
            for entry in query:
                lines.append(str(entry.get_object_dict()))
                if entry.status == 'Aborted':
                    logging.error('Aborted entry: %s is both active and '
                                  'complete. Setting active value to False.',
                                  str(entry))
                    entry.active = False
                    entry.save()
            self._send_inconsistency_message(subject, lines)


    def _check_for_multiple_platform_hosts(self):
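        # A host should carry at most one platform label; report hosts that
        # carry more than one.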
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _check_for_no_platform_hosts(self):
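        # Valid hosts with no platform label at all; these are only logged,
        # not reported via email.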
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    def _check_for_multiple_atomic_group_hosts(self):
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)


    def _cleanup_orphaned_containers(self):
        """Clean up orphaned containers in each drone.

        The function queues an lxc_cleanup call in each drone without waiting
        for the script to finish, as the cleanup procedure could take minutes;
        the script output is logged.

        """
        ssp_enabled = global_config.global_config.get_config_value(
                'AUTOSERV', 'enable_ssp_container')
        if not ssp_enabled:
            logging.info('Server-side packaging is not enabled, no need to clean'
                         ' up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()
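

# A minimal sketch of how the scheduler is expected to drive these cleanup
# classes (assumed driver code: the real wiring lives in the scheduler
# process, and the interval value shown here is illustrative only):
#
#     user_cleanup = UserCleanup(db, clean_interval_minutes=60)
#     daily_upkeep = TwentyFourHourUpkeep(db, drone_manager)
#     user_cleanup.initialize()
#     daily_upkeep.initialize()
#     while scheduler_is_running:       # main scheduler tick loop
#         user_cleanup.run_cleanup_maybe()
#         daily_upkeep.run_cleanup_maybe()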