#!/usr/bin/env python

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os.path
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta
from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator
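#
# The Phabricator() constructor used below picks up its credentials from the
# local arcanist configuration. As an illustration only (the host URL and
# token are placeholders, not part of this script), a minimal ~/.arcrc could
# look like:
#   {
#     "hosts": {
#       "https://reviews.llvm.org/api/": {"token": "api-xxxxxxxxxxxxxxxxxxxx"}
#     }
#   }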

GIT_REPO_METADATA = (("llvm", "https://llvm.org/git/llvm.git"), )

# The PhabXXX classes below represent objects as modelled by Phabricator.
# The classes can be serialized to disk, so that we don't needlessly have to
# re-fetch lots of data from Phabricator, as that would make this script
# unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return self.id2PhabObjects.keys()

    def get_objects(self):
        return self.id2PhabObjects.values()

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing..."
                  .format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print("Cache seems to be corrupt. " +
                          "Not using cache. Error message: {0}".format(err))

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print("Wrote cache to disk, most_recent_info={0}".format(
            datetime.fromtimestamp(self.most_recent_info)
            if self.most_recent_info is not None else None))


class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        inHunk = False
        hunkStart = -1
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if inHunk is True:
                inHunk = False
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
            offset += 1
        if inHunk is True:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + \
                [(sys.maxsize, sys.maxsize)]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t

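# A small worked example of the hunk bookkeeping above, with made-up values
# purely for illustration: for a hunk with oldOffset 10 whose corpus is
#     " context\n-removed\n+added\n context"
# only old line 11 is recorded as changed; it is padded with up to 3 context
# lines on each side (clamped to the start of the hunk), so
# actual_lines_changed_offset ends up as [(10, 15)].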

class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
    phab = Phabricator()
    phab.update_interfaces()
    return phab


def update_cached_info(phab, cache, phab_query, order, record_results,
                       max_nr_entries_per_fetch, max_nr_days_to_cache):
    """Fetch data from Phabricator and record it into the cache.

    Pages through the Phabricator REST API results (following the "after"
    cursor) until either all records have been fetched, or the cache already
    contains all info for the past max_nr_days_to_cache days.
    """
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - \
        timedelta(days=max_nr_days_to_cache)
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(
        datetime.fromtimestamp(most_recent_info)))
    while (after is not None
           and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch):
        need_more_older_data = \
            (cache.oldest_info is None or
             datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch)
        print(("need_more_older_data={0} cache.oldest_info={1} " +
               "oldest_info_to_fetch={2}").format(
                   need_more_older_data,
                   datetime.fromtimestamp(cache.oldest_info)
                   if cache.oldest_info is not None else None,
                   oldest_info_to_fetch))
        need_more_newer_data = \
            (cache.most_recent_info is None or
             cache.most_recent_info < most_recent_info)
        print(("need_more_newer_data={0} cache.most_recent_info={1} " +
               "most_recent_info={2}")
              .format(need_more_newer_data, cache.most_recent_info,
                      most_recent_info))
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(
            datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()


def record_reviews(cache, reviews, phab):
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
        if "dateModified" not in phabReview.__dict__ or \
                dateModified > phabReview.dateModified:
            diff_results = phab.differential.querydiffs(revisionIDs=[id])
            diff_ids = sorted(diff_results.keys())
            phabDiffs = []
            for diff_id in diff_ids:
                diffInfo = diff_results[diff_id]
                d = PhabDiff(diff_id)
                d.update(diffInfo)
                phabDiffs.append(d)
            phabReview.update(title, dateCreated, dateModified, author)
            phabReview.setPhabDiffs(phabDiffs)
            print("Updated D{0} modified on {1} ({2} diffs)".format(
                id, datetime.fromtimestamp(dateModified), len(phabDiffs)))

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


PHABCACHESINFO = ((reviews_cache, ("differential", "revision", "search"),
                   "updated", record_reviews, 5, 7),
                  (users_cache, ("user", "search"), "newest", record_users,
                   100, 1000))
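
# For illustration: the first entry above makes update_cached_info() walk
# the attribute chain to phab.differential.revision.search and call it as
#   phab.differential.revision.search(order="updated", limit=5)
# i.e. fetch at most 5 entries per request and keep the past 7 days of
# reviews cached; the second entry does the same for phab.user.search.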


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print("Loaded {0}: {1} entries".format(
            cache.get_name(), len(cache.get_ids_in_cache())))
        print("Loaded {0}: most recent info: {1}".format(
            cache.get_name(),
            datetime.fromtimestamp(cache.most_recent_info)
            if cache.most_recent_info is not None else None))


def update_cache(phab):
    load_cache()
    for cache, phab_query, order, record_results, max_nr_entries_per_fetch, \
            max_nr_days_to_cache in PHABCACHESINFO:
        update_cached_info(phab, cache, phab_query, order, record_results,
                           max_nr_entries_per_fetch, max_nr_days_to_cache)
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache),
                                          cache.get_name()))
        cache.write_cache_to_disk()


def get_most_recent_reviews(days):
    newest_reviews = sorted(
        reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = \
        datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            return result
        result.append(review)
    return result


# All of the above code is about fetching data from Phabricator and caching
# it on local disk. The code below contains the actual "business logic" for
# this script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg)

    newest_reviews = get_most_recent_reviews(days)
    add_msg("These are the reviews that may be interesting for you to " +
            "review. The report below has two sections. The first section " +
            "is organized per review; the second section is organized per " +
            "potential reviewer.\n")
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = \
        datetime.fromtimestamp(oldest_review.dateModified) \
        if oldest_review else None
    add_msg(("The report below is based on analyzing the reviews that got " +
             "touched in the past {0} days (since {1}). " +
             "The script found {2} such reviews.\n").format(
                 days, oldest_datetime, len(newest_reviews)))
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg(("{0:>3}. https://reviews.llvm.org/D{1} by {2}\n  {3}\n" +
                 "  Last updated on {4}").format(
                     i, review.id,
                     get_real_name_from_author(review.author), review.title,
                     datetime.fromtimestamp(review.dateModified)))
        for reviewer, scores in matched_reviewers:
            add_msg("    potential reviewer {0}, score {1}".format(
                reviewer,
                "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")"))
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg("\n\nSUMMARY FOR {0} (found {1} reviews):".format(
            reviewer, len(reviews_and_scores)))
        for review, scores in reviews_and_scores:
            add_msg("[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                "/".join(["{0:.1f}%".format(s) for s in scores]), review.id,
                review.title, get_real_name_from_author(review.author)))
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(
            cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors='ignore')


reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output):
    email2nr_occurrences = {}
    if blame_output is None:
        return email2nr_occurrences
    for line in blame_output.split('\n'):
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurrences:
                email2nr_occurrences[author_email_address] = 1
            else:
                email2nr_occurrences[author_email_address] += 1
    return email2nr_occurrences

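# For illustration, `git blame --line-porcelain` emits a block of header
# lines for every blamed line, roughly like (author details made up):
#   author Jane Doe
#   author-mail <jane@example.com>
#   author-time 1526650000
# so parse_blame_output_line_porcelain() counts one occurrence of
# "jane@example.com" for each line blamed on that author.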


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume the last revision before the diff was modified is the revision
    # the diff applies to.
    git_repo = "git_repos/llvm"
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" master'.format(
        git_repo,
        datetime.fromtimestamp(
            diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"))
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e " +
                       "-w --line-porcelain -L {1},{2} {3} -- {4}").format(
                           git_repo, start_line, end_line, base_revision,
                           path)
                blame_output = get_git_cmd_output(cmd)
                for reviewer, nr_occurrences in \
                        parse_blame_output_line_porcelain(
                            blame_output).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurrences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for the authors of the whole file.
        cmd = ("git -C {0} blame --encoding=utf-8 --date iso -f -e -w " +
               "--line-porcelain {1} -- {2}").format(git_repo, base_revision,
                                                     path)
        blame_output = get_git_cmd_output(cmd)
        for reviewer, nr_occurrences in parse_blame_output_line_porcelain(
                blame_output).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1

    # Compute "match scores"
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = \
        [(reviewer,
          (reviewers2nr_lines_touched.get(reviewer, 0)*100.0/total_nr_lines
           if total_nr_lines != 0 else 0,
           reviewers2nr_files_touched[reviewer]*100.0/total_nr_files
           if total_nr_files != 0 else 0))
         for reviewer, nr_files
         in reviewers2nr_files_touched.items()]
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores

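# A small worked example of the scoring above, with invented numbers: if a
# diff touches 4 files and git blame attributes 60 lines in the changed
# ranges overall, a reviewer who authored 30 of those lines and shows up in
# 2 of the 4 files gets the score tuple
#   (30 * 100.0 / 60, 2 * 100.0 / 4) == (50.0, 50.0)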

def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(
        review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        # Return an empty list rather than None, so that callers can filter
        # and iterate over the result unconditionally.
        return []
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write('.')
    sys.stdout.flush()
    logging.debug("matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, msg):
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg['From'] = ''
        email_msg['To'] = email_address
        email_msg['Subject'] = 'LLVM patches you may be able to review.'
        email_msg.attach(email.mime.text.MIMEText(msg, 'plain'))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg['From'], email_msg['To'],
                   email_msg.as_string())
    s.quit()


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [r for r in potential_reviewers
                                        if r[0] in people_to_look_for]

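# For illustration, the filter factory above can be used like this
# (addresses invented):
#   flt = filter_reviewers_to_report_for(["jane@example.com"])
#   flt([("jane@example.com", (50.0, 50.0)), ("joe@example.com", (1.0, 2.0))])
#   # -> [("jane@example.com", (50.0, 50.0))]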

def main():
    parser = argparse.ArgumentParser(
        description='Match open reviews to potential reviewers.')
    parser.add_argument(
        '--no-update-cache',
        dest='update_cache',
        action='store_false',
        default=True,
        help='Do not update cached Phabricator objects')
    parser.add_argument(
        'email_addresses',
        nargs='*',
        help="The email addresses (as known by LLVM git) of " +
             "the people to look for reviews for.")
    parser.add_argument('--verbose', '-v', action='count', default=0)

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    people_to_look_for = [e.decode('utf-8') for e in args.email_addresses]

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for))
    send_emails(people_to_look_for, msg)


if __name__ == "__main__":
    main()
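
# Example invocation, assuming a configured ~/.arcrc and a local SMTP server
# (the script file name and address below are placeholders):
#   $ python find_interesting_reviews.py jane@example.com
# This prints the report to stdout and emails it to jane@example.com.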