# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import json
from multiprocessing import Pool
import pandas as pd
import pathlib
import numpy as np

BRANCH_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)
MAIN_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / ".."
    / "main"
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)

MULTIPROCESSING_NUM_PER_BATCH = 5
MULTIPROCESSING_NUM_AGENTS = 10


class ChangeType(IntEnum):
    UNKNOWN = 0
    DELETED = 1
    ADDED = 2
    CHANGED = 3


class DirectoryDoesNotExist(ValueError):
    """Raised when the specified directory does not exist."""

    pass


class ChangeSummary:
    """Represents the change summary between 2 directories containing \
    artifacts.
    """

    def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list):
        """Initializes an instance of a ChangeSummary.

        Args:
            new_artifacts_dir (str): The relative path to the directory with the
                new discovery artifacts.
            current_artifacts_dir (str): The relative path to the directory with
                the current discovery artifacts.
            temp_dir (str): The relative path to the directory used for
                temporary storage where intermediate files will be stored.
            file_list (list): A list of strings containing files to analyze.
        """

        self._file_list = file_list
        self._new_artifacts_dir = pathlib.Path(new_artifacts_dir)
        self._current_artifacts_dir = pathlib.Path(current_artifacts_dir)
        self._temp_dir = pathlib.Path(temp_dir)

        # Sanity checks to ensure directories exist
        self._raise_if_directory_not_found(self._new_artifacts_dir)
        self._raise_if_directory_not_found(self._current_artifacts_dir)
        self._raise_if_directory_not_found(self._temp_dir)

    def _raise_if_directory_not_found(self, directory):
        """Raises DirectoryDoesNotExist if the `directory` doesn't exist.

        Args:
            directory (str): The relative path to the `directory`.
        """

        if not pathlib.Path(directory).exists():
            raise DirectoryDoesNotExist(
                "Directory does not exist : {0}".format(directory)
            )

    def _load_json_to_dataframe(self, file_path):
        """Returns a pandas dataframe from the json file provided.

        Args:
            file_path (str): The relative path to the discovery artifact to
                parse.
        """

        # Create an empty dataframe as we will need to return it if the file
        # doesn't exist
        dataframe_doc = pd.DataFrame()

        if pathlib.Path(file_path).is_file():
            with open(file_path, "r") as f:
                # Now load the json file into a pandas dataframe as a flat table
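                # Illustrative example (hypothetical document, not a real
                # artifact): a nested document such as
                #   {"schemas": {"Item": {"type": "object"}}}
                # is flattened by pd.json_normalize into a single-row
                # dataframe with one dotted column, "schemas.Item.type",
                # holding the value "object".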
                dataframe_doc = pd.json_normalize(json.load(f))
        return dataframe_doc

    def _get_discovery_differences(self, filename):
        """Returns a pandas dataframe which contains the differences between
        the current and new discovery artifacts, corresponding to the file
        name provided.

        Args:
            filename (str): The name of the discovery artifact to parse.
        """
        # The paths of the 2 discovery artifacts to compare
        current_artifact_path = self._current_artifacts_dir / filename
        new_artifact_path = self._new_artifacts_dir / filename

        # Use a helper function to load the discovery artifacts into pandas
        # dataframes
        current_doc = self._load_json_to_dataframe(current_artifact_path)
        new_doc = self._load_json_to_dataframe(new_artifact_path)

        # Concatenate the 2 dataframes, transpose them, and create
        # a new dataframe called combined_docs with columns
        # `Key`, `CurrentValue`, `NewValue`.
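        # Illustrative example (hypothetical values): if the current artifact
        # contains {"title": "Drive API"} and the new artifact contains
        # {"title": "Drive"}, combined_docs will hold the single row
        #   Key="title", CurrentValue="Drive API", NewValue="Drive".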
        combined_docs = (
            pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"])
            # Drop the original per-document index level
            .reset_index(drop=True, level=1)
            # Transpose the dataframe. The resulting columns should be
            # ["Key", "CurrentValue", "NewValue"]
            .rename_axis(["Key"], axis=1).transpose()
            # Move the `Key` index into a regular column
            .reset_index()
        )

        # When discovery documents are added, the column `CurrentValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "CurrentValue" not in combined_docs.columns:
            combined_docs["CurrentValue"] = np.nan

        # When discovery documents are deleted, the column `NewValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "NewValue" not in combined_docs.columns:
            combined_docs["NewValue"] = np.nan

        # Split the Key into 2 columns, `Parent` and `Child`, in order
        # to group keys with the same parents together and summarize the
        # changes by parent.
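        # For example (illustrative): the key "schemas.Item.type" is split
        # into Parent="schemas.Item" and Child="type".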
        parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True)
        # Rename the columns and join them with the combined_docs dataframe.
        # If we only have a `Parent` column, it means that the Key doesn't have
        # any children.
        if len(parent_child_df.columns) == 1:
            parent_child_df.columns = ["Parent"]
        else:
            parent_child_df.columns = ["Parent", "Child"]
        combined_docs = combined_docs.join(parent_child_df)

        # Create a new column `Added` to identify rows which have new keys.
        combined_docs["Added"] = np.where(
            combined_docs["CurrentValue"].isnull(), True, False
        )

        # Create a new column `Deleted` to identify rows which have deleted keys.
        combined_docs["Deleted"] = np.where(
            combined_docs["NewValue"].isnull(), True, False
        )

        # Aggregate the keys added by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_added_agg = (
            combined_docs.groupby("Parent")
            .Added.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
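        # For example (illustrative): the parent "schemas" has 1 level, while
        # "schemas.Item.properties" has 3.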
        parent_added_agg["NumLevels"] = (
            parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Aggregate the keys deleted by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_deleted_agg = (
            combined_docs.groupby("Parent")
            .Deleted.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_deleted_agg["NumLevels"] = (
            parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Create a list of all parents that have been added, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is new, as
        # all of its children keys have been added.
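        # For example (illustrative): if every key under "schemas.NewItem"
        # has Added == True, the proportion of added keys for that parent is
        # 1, so "schemas.NewItem" itself is treated as newly added.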
        all_added = (
            parent_added_agg[
                (parent_added_agg["Proportion"] == 1)
                & (parent_added_agg["Added"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Create a list of all parents that have been deleted, in hierarchical
        # order. When `Proportion` is 1, it means that the parent has been
        # deleted, as all of its children keys have been deleted.
        all_deleted = (
            parent_deleted_agg[
                (parent_deleted_agg["Proportion"] == 1)
                & (parent_deleted_agg["Deleted"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Go through the list of parents that have been added. If we find any
        # keys whose parent starts with a parent in this list, then the
        # entire parent is new. We don't need verbose information about the
        # children, so we replace the parent.
        for word in all_added:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Go through the list of parents that have been deleted. If we find any
        # keys whose parent starts with a parent in this list, then the
        # entire parent is deleted. We don't need verbose information about
        # the children, so we replace the parent.
        for word in all_deleted:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Create a new dataframe with only the keys which have changed
        docs_diff = combined_docs[
            combined_docs["CurrentValue"] != combined_docs["NewValue"]
        ].copy(deep=False)

        # Get the API and Version from the file name, excluding the extension.
        api_version_string = filename.split(".")[:-1]
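        # For example (illustrative): "drive.v3.json" yields
        # api_version_string == ["drive", "v3"], so Name="drive" and
        # Version="v3" below.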
        # Create columns `Name` and `Version` using the version string
        docs_diff["Name"] = api_version_string[0]
        docs_diff["Version"] = ".".join(api_version_string[1:])

        # These conditions are used as arguments in the `np.where` function
        # below.
        deleted_condition = docs_diff["NewValue"].isnull()
        added_condition = docs_diff["CurrentValue"].isnull()

        # Create a new `ChangeType` column. The `np.where()` function is like a
        # ternary operator. When the `deleted_condition` is `True`, the
        # `ChangeType` will be `ChangeType.DELETED`. If the `added_condition` is
        # `True`, the `ChangeType` will be `ChangeType.ADDED`, otherwise the
        # `ChangeType` will be `ChangeType.CHANGED`.
        docs_diff["ChangeType"] = np.where(
            deleted_condition,
            ChangeType.DELETED,
            np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED),
        )

        # Filter out keys which rarely affect functionality. For example:
        # {"description", "documentation", "enum", "etag", "revision", "title",
        # "url", "rootUrl"}
        docs_diff = docs_diff[
            ~docs_diff["Key"].str.contains(
                "|".join(self._get_keys_to_ignore()), case=False
            )
        ]

        # Group keys with similar parents together and create a new column
        # called 'Count' which indicates the number of keys that have been
        # grouped together. The reason for the count column is that when keys
        # have the same parent, we group them together to improve readability.
        docs_diff_with_count = (
            docs_diff.groupby(
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"]
            )
            .size()
            .reset_index(name="Count")
        )

        # Add the `Count` column to docs_diff
        docs_diff = docs_diff.merge(docs_diff_with_count)

        # When the count is greater than 1, update the key with the name of the
        # parent since we are consolidating keys with the same parent.
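        # For example (illustrative): if "schemas.Item.id" and
        # "schemas.Item.name" were both added, both rows get
        # Key="schemas.Item" with Count=2 and collapse into a single row
        # after drop_duplicates() below.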
        docs_diff.loc[docs_diff["Count"] > 1, "Key"] = docs_diff["Parent"]

        return docs_diff[
            ["Key", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"]
        ].drop_duplicates()

    def _build_summary_message(self, api_name, is_feature):
        """Returns a string containing the summary for a given api. The string
        returned will be in the format `fix(<api_name>): update the api`
        when `is_feature=False` and `feat(<api_name>): update the api`
        when `is_feature=True`.

        Args:
            api_name (str): The name of the api to include in the summary.
            is_feature (bool): If True, use the prefix `feat`, otherwise use
                `fix`.
        """

        # Build the conventional commit string based on the arguments provided
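        # For example (illustrative): api_name="drive" with is_feature=True
        # yields "feat(drive): update the api".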
        commit_type = "feat" if is_feature else "fix"
        return "{0}({1}): update the api".format(commit_type, api_name)

    def _get_keys_to_ignore(self):
        """Returns a list of strings with keys to ignore because they rarely
        affect functionality.

        Args: None
        """
        keys_to_ignore = [
            "description",
            "documentation",
            "enum",
            "etag",
            "revision",
            "title",
            "url",
            "rootUrl",
        ]
        return keys_to_ignore

    def _get_stable_versions(self, versions):
        """Returns a pandas series `pd.Series()` of boolean values,
        corresponding to the given series, indicating whether the version is
        considered stable or not.

        Args:
            versions (object): a pandas series containing version
                information for all discovery artifacts.
        """
        # Use a regex on the version to find versions with the pattern
        # <v>.<0-9>.<0-9>.<0-9>. Any api that matches this pattern will be
        # labeled as stable. In other words, v1, v1.4 and v1.4.5 are stable,
        # but v1b1, v1alpha and v1beta1 are not.
        return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)", expand=False).notnull()

    def _get_summary_and_write_to_disk(self, dataframe, directory):
        """Writes summary information to file about changes made to discovery
        artifacts based on the provided dataframe and returns a dataframe
        with the same. The file `'allapis.dataframe'` is saved to `directory`.

        Args:
            dataframe (object): a pandas dataframe containing summary change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
        """

        dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"])

        # Create a filter for features, which contains only rows which have keys
        # that have been deleted or added, that will be used as an argument in
        # the `np.where()` call below.
        filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | (
            dataframe["ChangeType"] == ChangeType.ADDED
        )

        # Create a new column `IsFeature` to indicate which rows should be
        # considered as features.
        dataframe["IsFeature"] = np.where(filter_features, True, np.nan)

        # Create a new column `IsFeatureAggregate` which will be used to
        # summarize the api changes. We can either have a feature or a fix,
        # but not both.
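        # For example (illustrative): if any row for the "drive" api has
        # IsFeature == True, every "drive" row gets IsFeatureAggregate ==
        # True, so the whole api is summarized as a feature.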
        dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform(
            lambda x: x.any()
        )

        # Create a new column `Summary`, which will contain a string with the
        # conventional commit message.
        dataframe["Summary"] = np.vectorize(self._build_summary_message)(
            dataframe["Name"], dataframe["IsFeatureAggregate"]
        )

        # Write the final dataframe to disk as it will be used in the
        # buildprbody.py script
        dataframe.to_csv(directory / "allapis.dataframe")
        return dataframe

    def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df):
        """Writes verbose information to file about changes made to discovery
        artifacts based on the provided dataframe. A separate file is saved
        for each api in `directory`. The extension of the files will be
        `'.verbose'`.

        Args:
            dataframe (object): a pandas dataframe containing verbose change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
            summary_df (object): A dataframe containing a summary of the changes
        """
        # Array of strings which will contain verbose change information for
        # each api
        verbose_changes = []

        # Sort the dataframe to minimize file operations below.
        dataframe.sort_values(
            by=["Name", "Version", "ChangeType"], ascending=True, inplace=True
        )

        # Select only the relevant columns. We need to create verbose output
        # by api name, version and change type, so we group by these columns.
        change_type_groups = dataframe[
            ["Name", "Version", "ChangeType", "Key", "Count"]
        ].groupby(["Name", "Version", "ChangeType"])

        lastApi = ""
        lastVersion = ""
        lastType = ChangeType.UNKNOWN

        f = None
        for name, group in change_type_groups:
            currentApi = name[0]
            currentVersion = name[1]
            currentType = name[2]

            # We need to handle file opening and closing when processing an API
            # which is different from the previous one
            if lastApi != currentApi:
                # If we are processing a new api, close the file used for
                # processing the previous API
                if f is not None:
                    f.writelines(verbose_changes)
                    f.close()
                    f = None
                # Clear the array of strings with information from the previous
                # api and reset the last version
                verbose_changes = []
                lastVersion = ""
                # Create a file which contains verbose changes for the current
                # API being processed
                filename = "{0}.verbose".format(currentApi)
                f = open(pathlib.Path(directory / filename), "a")
                lastApi = currentApi

            # Create a filter with only the rows for the current API
            current_api_filter = summary_df["Name"] == currentApi

            # Get the string in the `Summary` column for the current api and
            # append it to `verbose_changes`. The `Summary` column contains
            # the conventional commit message. Use pandas.Series.iloc[0] to
            # retrieve only the first element, since all the values in the
            # summary column are the same for a given API.
            verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0])

            # If the version has changed, we need to append a new heading
            # in the verbose summary which contains the api and version.
            if lastVersion != currentVersion:
                # Append a header string with the API and version
                verbose_changes.append(
                    "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion)
                )

                lastVersion = currentVersion
                lastType = ChangeType.UNKNOWN

            # Whenever the change type is different, we need to create a new
            # heading for the group of keys with the same change type.
            if currentType != lastType:
                if currentType == ChangeType.DELETED:
                    verbose_changes.append("\nThe following keys were deleted:\n")
                elif currentType == ChangeType.ADDED:
                    verbose_changes.append("\nThe following keys were added:\n")
                else:
                    verbose_changes.append("\nThe following keys were changed:\n")

                lastType = currentType

            # Append the keys, and corresponding count, in the same change
            # type group.
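            # For example (illustrative), a rendered entry looks like:
            #   - schemas.Item (Total Keys: 2)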
            verbose_changes.extend(
                [
                    "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"])
                    for index, row in group[["Key", "Count"]].iterrows()
                ]
            )

        # Make sure to close the last file and write the changes.
        if f is not None:
            f.writelines(verbose_changes)
            f.close()
            f = None

    def detect_discovery_changes(self):
        """Writes a summary of the changes to the discovery artifacts to disk
        at the path specified in `temp_dir`.

        Args: None
        """
        result = pd.DataFrame()
        # Process files in parallel to improve performance
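        # Note: the third argument to pool.map is its `chunksize`; each worker
        # receives MULTIPROCESSING_NUM_PER_BATCH files at a time.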
        with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool:
            frames = pool.map(
                self._get_discovery_differences,
                self._file_list,
                MULTIPROCESSING_NUM_PER_BATCH,
            )
        # Combine the per-file dataframes into a single dataframe.
        if frames:
            result = pd.concat(frames, ignore_index=True)

        if len(result):
            # Sort the resulting dataframe by `Name`, `Version`, `ChangeType`
            # and `Key`
            sort_columns = ["Name", "Version", "ChangeType", "Key"]
            result.sort_values(by=sort_columns, ascending=True, inplace=True)

            # Create a folder which will be used by the `createcommits.sh` and
            # `buildprbody.py` scripts.
            pathlib.Path(self._temp_dir).mkdir(exist_ok=True)

            # Create a summary which contains a conventional commit message
            # for each API and write it to disk.
            summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir)

            # Create verbose change information for each API which contains
            # a list of changes by key and write it to disk.
            self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df)