blob: 04f6d1c16211ceaed11c2cf542efd8490d683051 [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
4
5
Michael Liangda8c60a2014-06-03 13:24:51 -07006import time, logging, random
7
mblighf3294cc2009-04-08 21:17:38 +00008from autotest_lib.frontend.afe import models
Dan Shic458f662015-04-29 12:12:38 -07009from autotest_lib.scheduler import email_manager
10from autotest_lib.scheduler import scheduler_config
11from autotest_lib.client.common_lib import global_config
showard8dbd05a2010-01-12 18:54:59 +000012from autotest_lib.client.common_lib import host_protections
Gabe Black1e1c41b2015-02-04 23:55:15 -080013from autotest_lib.client.common_lib.cros.graphite import autotest_stats
mblighf3294cc2009-04-08 21:17:38 +000014
mblighf3294cc2009-04-08 21:17:38 +000015
class PeriodicCleanup(object):
    """Base class for cleanup work that repeats on a fixed interval.

    Subclasses supply the actual work by overriding `_cleanup`. `initialize`
    optionally runs it once, and `run_cleanup_maybe` runs it whenever more than
    `clean_interval_minutes` have passed since the last recorded run.
    """

    def __init__(self, db, clean_interval_minutes, run_at_initialize=False):
        """Store the cleanup configuration and start the interval clock.

        @param db: Database connection object, kept for use by subclasses.
        @param clean_interval_minutes: Minutes that must elapse between two
                                       interval-triggered cleanups.
        @param run_at_initialize: True to run the cleanup from `initialize`.
        """
        self._run_at_initialize = run_at_initialize
        self.clean_interval_minutes = clean_interval_minutes
        self._db = db
        # The first interval is measured from construction time.
        self._last_clean_time = time.time()


    def initialize(self):
        """Method called by scheduler at the startup.

        Runs the cleanup once if that was requested at construction. The
        interval clock is not reset here.
        """
        if not self._run_at_initialize:
            return
        self._cleanup()


    def run_cleanup_maybe(self):
        """Run the cleanup if the configured interval has fully elapsed.

        The last-run timestamp is only refreshed after `_cleanup` returns.
        """
        next_due = self._last_clean_time + self.clean_interval_minutes * 60
        if time.time() <= next_due:
            return
        self._cleanup()
        self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
48
49
class UserCleanup(PeriodicCleanup):
    """Frequent cleanup pass over jobs, host blocks, hosts and sessions.

    The interval is controlled by the global config variable
    clean_interval_minutes in the SCHEDULER section, which the caller passes
    to the constructor.
    """
    # Stats timer; every method marked with @timer.decorate is wrapped by it.
    timer = autotest_stats.Timer('monitor_db_cleanup.user_cleanup')


    def __init__(self, db, clean_interval_minutes):
        """Set up the cleanup and start the host reverify clock.

        @param db: Database connection object.
        @param clean_interval_minutes: Minutes between two cleanup runs.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        # Reverification has its own period, tracked separately from the
        # cleanup interval (see _should_reverify_hosts_now).
        self._last_reverify_time = time.time()


    @timer.decorate
    def _cleanup(self):
        """Run every periodic cleanup step, in a fixed order."""
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._reverify_dead_hosts()
        self._django_session_cleanup()


    @timer.decorate
    def _abort_timed_out_jobs(self):
        """Abort jobs with incomplete queue entries whose timeout has passed."""
        logging.info(
                'Aborting all jobs that have timed out and are not complete')
        incomplete_jobs = models.Job.objects.filter(
                hostqueueentry__complete=False)
        timed_out_jobs = incomplete_jobs.extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        # A job matches once per incomplete entry, hence distinct().
        for timed_out_job in timed_out_jobs.distinct():
            logging.warning('Aborting job %d due to job timeout',
                            timed_out_job.id)
            timed_out_job.abort()


    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        overdue_sql = """
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE <
            NOW()"""
        overdue_ids = [row[0] for row in self._db.execute(overdue_sql)]
        # Load the matching entries through the ORM so abort() is available.
        overdue_entries = models.HostQueueEntry.objects.filter(
                id__in=overdue_ids)
        for overdue_entry in overdue_entries.distinct():
            logging.warning('Aborting entry %s due to max runtime',
                            overdue_entry)
            overdue_entry.abort()


    @timer.decorate
    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that still point at invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relations from invalid first_model objects to second_model.

        @param first_model: Model class whose invalid instances are inspected.
        @param relation_field: Name of the attribute on a first_model instance
                               holding the related manager to clear.
        @param second_model: Model class on the other side of the relation.

        @return: List of messages, one per invalid object that had relations.
                 Empty when first_model has no 'invalid' field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []

        stale_objects = list(first_model.objects.filter(invalid=True))
        # Attaches the related second_model objects to each instance as
        # `related_objects`.
        first_model.objects.populate_relationships(
                stale_objects, second_model, 'related_objects')

        messages = []
        for stale in stale_objects:
            if not stale.related_objects:
                continue
            related_names = ', '.join(
                    str(related) for related in stale.related_objects)
            messages.append('Invalid %s %s is related to %ss: %s'
                            % (first_model.__name__, stale,
                               second_model.__name__, related_names))
            # Drop the offending relationships.
            getattr(stale, relation_field).clear()
        return messages


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Run the one-way check in both directions of a relationship.

        @param first_model: Model class on one side of the relation.
        @param first_field: Relation attribute name on first_model instances.
        @param second_model: Model class on the other side of the relation.
        @param second_field: Relation attribute name on second_model instances.

        @return: Messages from the first direction followed by the second.
        """
        forward = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        backward = self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model)
        return forward + backward


    def _check_all_invalid_related_objects(self):
        """Check every known model pair and report whatever was cleaned.

        When anything was found, the summary is logged as a warning and queued
        as a notification email.
        """
        model_pairs = (
                (models.Host, 'labels', models.Label, 'host_set'),
                (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                (models.Test, 'dependency_labels', models.Label, 'test_set'),
        )
        errors = []
        for pair in model_pairs:
            errors.extend(self._check_invalid_related_objects(*pair))

        if not errors:
            return

        subject = ('%s relationships to invalid models, cleaned all' %
                   len(errors))
        message = '\n'.join(errors)
        logging.warning(subject)
        logging.warning(message)
        email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _clear_inactive_blocks(self):
        """Delete ineligible-host rows of jobs with no incomplete entries."""
        logging.info('Clear out blocks for all completed jobs.')
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")


    def _should_reverify_hosts_now(self):
        """Tell whether the reverify period has elapsed.

        @return: False when the configured period is 0, otherwise True once
                 the period has passed since the last reverify.
        """
        period_seconds = scheduler_config.config.reverify_period_minutes * 60
        if period_seconds == 0:
            return False
        next_reverify = self._last_reverify_time + period_seconds
        return next_reverify <= time.time()


    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify.

        @param hosts: List of hosts that are candidates for reverification.

        @return: All hosts, sorted, unless a positive configured limit is
                 exceeded; then a random sample of that size.
        """
        limit = scheduler_config.config.reverify_max_hosts_at_once
        if limit <= 0 or len(hosts) <= limit:
            return sorted(hosts)
        return random.sample(hosts, limit)


    @timer.decorate
    def _reverify_dead_hosts(self):
        """Schedule a verify task for hosts stuck in Repair Failed.

        Only unlocked, valid hosts whose protection is not DO_NOT_VERIFY are
        considered, and only when the reverify period has elapsed.
        """
        if not self._should_reverify_hosts_now():
            return

        # Restart the reverify clock even if no host ends up selected.
        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        dead_hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False
        ).exclude(protection=host_protections.Protection.DO_NOT_VERIFY)
        if not dead_hosts:
            return

        candidates = list(dead_hosts)
        chosen = self._choose_subset_of_hosts_to_reverify(candidates)
        chosen_names = ', '.join(host.hostname for host in chosen)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(chosen),
                     len(candidates), chosen_names)
        for host in chosen:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)


    @timer.decorate
    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
        http://www.djangoproject.com/documentation/0.96/sessions/

        Note that the whole table is truncated, not just expired rows.
        """
        logging.info('Deleting old sessions from django_session')
        self._db.execute('TRUNCATE TABLE django_session')
224
225
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.

    The checks here only report inconsistencies (log and/or notification
    email); the single repair performed is clearing `active` on aborted queue
    entries that are marked both active and complete.
    """
    # Stats timer; every method marked with @timer.decorate is wrapped by it.
    timer = autotest_stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')


    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler starts.
                                  Default is set to True.

        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
            db, clean_interval_minutes, run_at_initialize=run_at_initialize)


    @timer.decorate
    def _cleanup(self):
        """Run the daily checks."""
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        # TODO(dshi): crbug.com/484039, after the bug is fixed, re-enable this.
        #self._cleanup_orphaned_containers()


    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        """Run every check for inconsistencies that need human attention."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are both active and complete.

        Aborted entries in that state are repaired by clearing `active`; all
        offending entries are included in the inconsistency message.
        """
        # Evaluate the query exactly once so the count in the subject always
        # matches the entries listed in the message, and so the database is
        # not queried again for each count and for the iteration.
        entries = list(models.HostQueueEntry.objects.filter(
                active=True, complete=True))
        if not entries:
            return

        subject = ('%d queue entries found with active=complete=1'
                   % len(entries))
        lines = []
        for entry in entries:
            lines.append(str(entry.get_object_dict()))
            if entry.status == 'Aborted':
                logging.error('Aborted entry: %s is both active and '
                              'complete. Setting active value to False.',
                              str(entry))
                entry.active = False
                entry.save()
        self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            # One line per host: id, hostname, count, label names.
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    @timer.decorate
    def _check_for_no_platform_hosts(self):
        """Log a warning listing valid hosts that have no platform label.

        Unlike the other checks this one only logs; no email is queued.
        """
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))


    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose labels map to several atomic groups."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            # One line per host: id, hostname, count, labels, atomic groups.
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log the subject as an error and queue a notification email.

        @param subject: Email subject; also the text that is logged.
        @param lines: Iterable of strings joined with newlines to form the
                      body, which is cut off after 5000 characters.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)


    @timer.decorate
    def _cleanup_orphaned_containers(self):
        """Cleanup orphaned containers in each drone.

        The function queues a lxc_cleanup call in each drone without waiting for
        the script to finish, as the cleanup procedure could take minutes and the
        script output is logged.

        """
        # Parse the setting as a boolean so that a configured "False" really
        # disables the cleanup.
        # NOTE(review): assumes get_config_value returns the raw string unless
        # a type is given, which would make "False" truthy - confirm.
        ssp_enabled = global_config.global_config.get_config_value(
                'AUTOSERV', 'enable_ssp_container', type=bool)
        if not ssp_enabled:
            logging.info('Server-side packaging is not enabled, no need to clean'
                         ' up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()