blob: dfe2075a5b0ee1b63b725d5a62d3ae1cf62c0224 [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
4
5
6import datetime, time, logging
7import common
8from autotest_lib.database import database_connection
9from autotest_lib.frontend.afe import models
10from autotest_lib.scheduler import email_manager, scheduler_config
11
12
class PeriodicCleanup(object):
    """Base class for cleanup work that is repeated on a fixed interval.

    Subclasses provide the actual work by overriding _cleanup().
    initialize() optionally runs one cleanup immediately, and
    run_cleanup_maybe() runs one whenever more than clean_interval minutes
    have passed since the last recorded cleanup time.
    """


    def __init__(self, db, clean_interval, run_at_initialize=False):
        """
        @param db: database handle; only stored here, for use by subclasses.
        @param clean_interval: minimum time between cleanups, in minutes.
        @param run_at_initialize: when True, initialize() runs a cleanup.
        """
        self._db = db
        self.clean_interval = clean_interval
        # The interval is measured from construction time, so the first
        # periodic cleanup happens one full interval after startup.
        self._last_clean_time = time.time()
        self._run_at_initialize = run_at_initialize


    def initialize(self):
        """Run a cleanup right away if that was requested at construction.

        Note that this does not touch the recorded last-clean time.
        """
        if self._run_at_initialize:
            self._cleanup()


    def run_cleanup_maybe(self):
        """Run _cleanup() if more than clean_interval minutes have elapsed."""
        # clean_interval is in minutes while time.time() is in seconds.
        next_clean_time = self._last_clean_time + self.clean_interval * 60
        if next_clean_time < time.time():
            self._cleanup()
            # Only reached when _cleanup() did not raise, so a failed
            # cleanup is attempted again on the next call.
            self._last_clean_time = time.time()


    def _cleanup(self):
        """Abstract cleanup method; subclasses must override it."""
        raise NotImplementedError
39
40
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval in the SCHEDULER section.

    Each run aborts jobs that exceeded their timeout or maximum runtime,
    deletes host blocks that belong to finished jobs, and removes
    relationships that still point at objects flagged as invalid.
    """


    def __init__(self, db, clean_interval_minutes):
        """
        @param db: database handle used for the raw SQL statements below.
        @param clean_interval_minutes: minutes between two cleanup runs.
        """
        super(UserCleanup, self).__init__(db, clean_interval_minutes)


    def _cleanup(self):
        """Run every periodic cleanup step, in a fixed order."""
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()


    def _abort_timed_out_jobs(self):
        """Abort jobs that still have incomplete queue entries after their
        timeout (created_on plus `timeout` hours) has passed.
        """
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        query = models.Job.objects.filter(hostqueueentry__complete=False).extra(
            where=['created_on + INTERVAL timeout HOUR < NOW()'])
        # The join on queue entries can yield a job once per entry, hence
        # distinct().
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            # NOTE(review): None is presumably the "aborted by" user --
            # confirm against models.Job.abort.
            job.abort(None)


    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        rows = self._db.execute("""
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_hrs HOUR < NOW()""")
        if not rows:
            # Nothing is overdue; skip building an ORM lookup with an empty
            # id list.
            return
        query = models.HostQueueEntry.objects.filter(
            id__in=[row[0] for row in rows])
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime', queue_entry)
            queue_entry.abort(None)


    def _check_for_db_inconsistencies(self):
        """Find and clean relationships that involve invalid objects."""
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()


    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        """Clear relationships from invalid first_model objects to
        second_model objects.

        @param first_model: model class whose invalid instances are checked.
        @param relation_field: name of the attribute on a first_model
                instance that holds the related manager for second_model.
        @param second_model: model class on the other side of the relation.

        @return: list of strings, one per invalid object that had related
                objects; empty if first_model has no 'invalid' field.
        """
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        # Stores the related second_model objects on each invalid object
        # under the attribute name 'related_objects', which is read below.
        first_model.objects.populate_relationships(invalid_objects,
                                                   second_model,
                                                   'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(str(related_object) for related_object
                                         in invalid_object.related_objects)
                error_lines.append('Invalid %s %s is related to %ss: %s'
                                   % (first_model.__name__, invalid_object,
                                      second_model.__name__, related_list))
                # Drop the relationships that were just reported.
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines


    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        """Run the one-way check in both directions of a relationship.

        @return: the combined list of error lines from both directions.
        """
        errors = self._check_invalid_related_objects_one_way(
            first_model, first_field, second_model)
        errors.extend(self._check_invalid_related_objects_one_way(
            second_model, second_field, first_model))
        return errors


    def _check_all_invalid_related_objects(self):
        """Check every known model pair and report what was cleaned.

        If anything was found, the summary is logged as a warning and queued
        as a notification email.
        """
        # Each entry: (model, field on it pointing at the second model,
        #              second model, field on it pointing back).
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User, 'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in model_pairs:
            errors.extend(self._check_invalid_related_objects(
                first_model, first_field, second_model, second_field))

        if errors:
            subject = ('%s relationships to invalid models, cleaned all' %
                       len(errors))
            message = '\n'.join(errors)
            logging.warning(subject)
            logging.warning(message)
            email_manager.manager.enqueue_notify_email(subject, message)


    def _clear_inactive_blocks(self):
        """Delete ineligible-host-queue rows whose job has no incomplete
        queue entries left.
        """
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")
153
154
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.

    It deletes expired Django sessions and reports database inconsistencies
    that it does not attempt to repair.
    """


    def __init__(self, db, run_at_initialize=True):
        """
        @param db: database handle used for the raw SQL statements below.
        @param run_at_initialize: when True (the default), initialize() runs
                a cleanup immediately.
        """
        clean_interval = 24 * 60 # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
            db, clean_interval, run_at_initialize=run_at_initialize)


    def _cleanup(self):
        """Run every daily upkeep step."""
        logging.info('Running 24 hour clean up')
        self._django_session_cleanup()
        self._check_for_uncleanable_db_inconsistencies()


    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
        http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'DELETE FROM django_session WHERE expire_date < NOW()'
        self._db.execute(sql)


    def _check_for_uncleanable_db_inconsistencies(self):
        """Run all checks that only report problems instead of fixing them."""
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()


    def _check_for_active_and_complete_queue_entries(self):
        """Report queue entries that are flagged both active and complete."""
        query = models.HostQueueEntry.objects.filter(active=True, complete=True)
        # Evaluate the query once so the count in the subject always matches
        # the entries listed in the message body.
        entries = list(query)
        if not entries:
            return
        subject = ('%d queue entries found with active=complete=1'
                   % len(entries))
        lines = [str(entry.get_object_dict()) for entry in entries]
        self._send_inconsistency_message(subject, lines)


    def _check_for_multiple_platform_hosts(self):
        """Report hosts that carry more than one platform label."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            # One message line per host: the row's columns separated by
            # spaces.
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _check_for_no_platform_hosts(self):
        """Report valid hosts that have no platform label at all."""
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
                 AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                   WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            subject = '%s hosts with no platform' % self._db.rowcount
            # All hostnames go on a single comma-separated line.
            self._send_inconsistency_message(
                subject, [', '.join(row[0] for row in rows)])


    def _check_for_multiple_atomic_group_hosts(self):
        """Report valid hosts whose labels map to more than one atomic group."""
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row)
                     for row in rows]
            self._send_inconsistency_message(subject, lines)


    def _send_inconsistency_message(self, subject, lines):
        """Log the subject as an error and queue a notification email.

        @param subject: one-line summary; logged and used as email subject.
        @param lines: list of strings joined with newlines to form the body.
        """
        logging.error(subject)
        message = '\n'.join(lines)
        # Cap the body at 5000 characters and say so in the email.
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)