| # Copyright 2021 Google LLC |
| |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| from enum import IntEnum |
| import json |
| from multiprocessing import Pool |
| import pathlib |
| |
| import numpy as np |
| import pandas as pd |
| |
| BRANCH_ARTIFACTS_DIR = ( |
| pathlib.Path(__file__).parent.resolve() |
| / "googleapiclient" |
| / "discovery_cache" |
| / "documents" |
| ) |
| MAIN_ARTIFACTS_DIR = ( |
| pathlib.Path(__file__).parent.resolve() |
| / ".." |
| / "main" |
| / "googleapiclient" |
| / "discovery_cache" |
| / "documents" |
| ) |
| |
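| # Tuning knobs for the multiprocessing pool below: the number of files |
| # handed to each worker per `pool.map` chunk, and the number of worker |
| # processes used to diff the artifacts in parallel. |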
| MULTIPROCESSING_NUM_PER_BATCH = 5 |
| MULTIPROCESSING_NUM_AGENTS = 10 |
| |
| |
| class ChangeType(IntEnum): |
| UNKNOWN = 0 |
| DELETED = 1 |
| ADDED = 2 |
| CHANGED = 3 |
| |
| |
| class DirectoryDoesNotExist(ValueError): |
| """Raised when the specified directory does not exist.""" |
| |
| pass |
| |
| |
| class ChangeSummary: |
| """Represents the change summary between 2 directories containing \ |
| artifacts. |
| """ |
| |
| def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list): |
| """Initializes an instance of a ChangeSummary. |
| |
| Args: |
| new_artifacts_dir (str): The relative path to the directory with the |
| new discovery artifacts. |
| current_artifacts_dir (str): The relative path to the directory with |
| the current discovery artifacts. |
| temp_dir (str): The relative path to the directory used for |
| temporary storage where intermediate files will be stored. |
| file_list (list): A list of strings containing files to analyze. |
| """ |
| |
| self._file_list = file_list |
| self._new_artifacts_dir = pathlib.Path(new_artifacts_dir) |
| self._current_artifacts_dir = pathlib.Path(current_artifacts_dir) |
| self._temp_dir = pathlib.Path(temp_dir) |
| |
| # Sanity checks to ensure directories exist |
| self._raise_if_directory_not_found(self._new_artifacts_dir) |
| self._raise_if_directory_not_found(self._current_artifacts_dir) |
| self._raise_if_directory_not_found(self._temp_dir) |
| |
| def _raise_if_directory_not_found(self, directory): |
| """Raises if the `directory` doesn't exist |
| |
| args: |
| directory (str): The relative path to the `directory` |
| """ |
| |
| if not pathlib.Path(directory).exists(): |
| raise DirectoryDoesNotExist( |
| "Directory does not exist : {0}".format(directory) |
| ) |
| |
| def _load_json_to_dataframe(self, file_path): |
| """Returns a pandas dataframe from the json file provided. |
| |
| Args: |
| file_path (str): The relative path to the discovery artifact to |
| parse. |
| """ |
| |
| # Create an empty dataframe as we will need to return it if the file |
| # doesn't exist |
| dataframe_doc = pd.DataFrame() |
| |
| if pathlib.Path(file_path).is_file(): |
| with open(file_path, "r") as f: |
| # Now load the json file into a pandas dataframe as a flat table |
| dataframe_doc = pd.json_normalize(json.load(f)) |
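| # For example, `pd.json_normalize` flattens a (hypothetical) document |
| # {"schemas": {"Foo": {"type": "object"}}} into a one-row dataframe |
| # with the single column "schemas.Foo.type". |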
| return dataframe_doc |
| |
| def _get_discovery_differences(self, filename): |
| """Returns a pandas dataframe which contains the differences with the |
| current and new discovery artifact directories, corresponding to the |
| file name provided. |
| |
| args: |
| filename (str): The name of the discovery artifact to parse. |
| """ |
| # The paths of the 2 discovery artifacts to compare |
| current_artifact_path = self._current_artifacts_dir / filename |
| new_artifact_path = self._new_artifacts_dir / filename |
| |
| # Use a helper function to load the discovery artifacts into pandas |
| # dataframes |
| current_doc = self._load_json_to_dataframe(current_artifact_path) |
| new_doc = self._load_json_to_dataframe(new_artifact_path) |
| |
| # Concatenate the 2 dataframes, transpose them, and create |
| # a new dataframe called combined_docs with columns |
| # `Key`, `CurrentValue`, `NewValue`. |
| combined_docs = ( |
| pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"]) |
| # Drop the index column |
| .reset_index(drop=True, level=1) |
| # Transpose the dataframe; the resulting columns should be |
| # ["Key", "CurrentValue", "NewValue"] |
| .rename_axis(["Key"], axis=1).transpose() |
| # Drop the index column |
| .reset_index() |
| ) |
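| # Illustrative shape (hypothetical values): after the steps above, a |
| # row of combined_docs looks like: |
| # Key CurrentValue NewValue |
| # schemas.Foo.type "string" "object" |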
| |
| # When discovery documents are added, the column `CurrentValue` will |
| # not exist. In that case, we'll just populate with `np.nan`. |
| if "CurrentValue" not in combined_docs.columns: |
| combined_docs["CurrentValue"] = np.nan |
| |
| # When discovery documents are deleted, the column `NewValue` will |
| # not exist. In that case, we'll just populate with `np.nan`. |
| if "NewValue" not in combined_docs.columns: |
| combined_docs["NewValue"] = np.nan |
| |
| # Split the Key into 2 columns for `Parent` and `Child` in order |
| # to group keys with the same parents together to summarize the changes |
| # by parent. |
| parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True) |
| # Rename the columns and join them with the combined_docs dataframe. |
| # If we only have a `Parent` column, it means that the Key doesn't have |
| # any children. |
| if len(parent_child_df.columns) == 1: |
| parent_child_df.columns = ["Parent"] |
| else: |
| parent_child_df.columns = ["Parent", "Child"] |
| combined_docs = combined_docs.join(parent_child_df) |
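| # For example, a (hypothetical) key "schemas.Foo.type" is split from the |
| # right into Parent "schemas.Foo" and Child "type". |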
| |
| # Create a new column `Added` to identify rows which have new keys. |
| combined_docs["Added"] = np.where( |
| combined_docs["CurrentValue"].isnull(), True, False |
| ) |
| |
| # Create a new column `Deleted` to identify rows which have deleted keys. |
| combined_docs["Deleted"] = np.where( |
| combined_docs["NewValue"].isnull(), True, False |
| ) |
| |
| # Aggregate the keys added by grouping keys with the same parents |
| # together to summarize the changes by parent rather than by key. |
| parent_added_agg = ( |
| combined_docs.groupby("Parent") |
| .Added.value_counts(normalize=True) |
| .reset_index(name="Proportion") |
| ) |
| |
| # Add a column NumLevels to indicate the number of levels in the tree |
| # which will allow us to sort the parents in hierarchical order. |
| parent_added_agg["NumLevels"] = ( |
| parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x)) |
| ) |
| |
| # Aggregate the keys deleted by grouping keys with the same parents |
| # together to summarize the changes by parent rather than by key. |
| parent_deleted_agg = ( |
| combined_docs.groupby("Parent") |
| .Deleted.value_counts(normalize=True) |
| .reset_index(name="Proportion") |
| ) |
| |
| # Add a column NumLevels to indicate the number of levels in the tree |
| # which will allow us to sort the parents in hierarchical order. |
| parent_deleted_agg["NumLevels"] = ( |
| parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x)) |
| ) |
| |
| # Create a list of all parents that have been added in hierarchical |
| # order. When `Proportion` is 1, it means that the parent is new as all |
| # children keys have been added. |
| all_added = ( |
| parent_added_agg[ |
| (parent_added_agg["Proportion"] == 1) |
| & (parent_added_agg["Added"] == True) |
| ][["Parent", "NumLevels"]] |
| .sort_values("NumLevels", ascending=True) |
| .Parent.to_list() |
| ) |
| |
| # Create a list of all parents that have been deleted in hierarchical |
| # order. When `Proportion` is 1, it means that the parent is deleted as |
| # all children keys have been deleted. |
| all_deleted = ( |
| parent_deleted_agg[ |
| (parent_deleted_agg["Proportion"] == 1) |
| & (parent_deleted_agg["Deleted"] == True) |
| ][["Parent", "NumLevels"]] |
| .sort_values("NumLevels", ascending=True) |
| .Parent.to_list() |
| ) |
| |
| # Go through the list of parents that have been added. If a key's parent |
| # starts with a parent in this list, then the entire parent is new. We |
| # don't need verbose information about the children, so we replace the |
| # parent. |
| for word in all_added: |
| combined_docs.Parent = np.where( |
| combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent |
| ) |
| |
| # Go through the list of parents that have been deleted. If a key's |
| # parent starts with a parent in this list, then the entire parent is |
| # deleted. We don't need verbose information about the children, so we |
| # replace the parent. |
| for word in all_deleted: |
| combined_docs.Parent = np.where( |
| combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent |
| ) |
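| # For example, if "schemas.Foo" is entirely new, a (hypothetical) key |
| # "schemas.Foo.type" has its Parent replaced with "schemas.Foo", so the |
| # addition is reported once rather than once per child key. |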
| |
| # Create a new dataframe with only the keys which have changed |
| docs_diff = combined_docs[ |
| combined_docs["CurrentValue"] != combined_docs["NewValue"] |
| ].copy(deep=False) |
| |
| # Get the API and Version from the file name but exclude the extension. |
| api_version_string = filename.split(".")[:-1] |
| # Create columns `Name` and `Version` using the version string |
| docs_diff["Name"] = api_version_string[0] |
| docs_diff["Version"] = ".".join(api_version_string[1:]) |
| |
| # These conditions are used as arguments in the `np.where` function |
| # below. |
| deleted_condition = docs_diff["NewValue"].isnull() |
| added_condition = docs_diff["CurrentValue"].isnull() |
| |
| # Create a new `ChangeType` column. The `np.where()` function is like a |
| # ternary operator. When the `deleted_condition` is `True`, the |
| # `ChangeType` will be `ChangeType.DELETED`. If the `added_condition` is |
| # `True`, the `ChangeType` will be `ChangeType.ADDED`, otherwise the |
| # `ChangeType` will be `ChangeType.CHANGED`. |
| docs_diff["ChangeType"] = np.where( |
| deleted_condition, |
| ChangeType.DELETED, |
| np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED), |
| ) |
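| # Illustrative mapping (hypothetical rows): |
| # CurrentValue NewValue -> ChangeType |
| # NaN "x" -> ADDED |
| # "x" NaN -> DELETED |
| # "x" "y" -> CHANGED |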
| |
| # Filter out keys which rarely affect functionality. For example: |
| # {"description", "documentation", "enum", "etag", "revision", "title", |
| # "url", "rootUrl"} |
| docs_diff = docs_diff[ |
| ~docs_diff["Key"].str.contains( |
| "|".join(self._get_keys_to_ignore()), case=False |
| ) |
| ] |
| |
| # Group keys with similar parents together and create a new column |
| # called 'Count' which indicates the number of keys that have been |
| # grouped together. The reason for the count column is that when keys |
| # have the same parent, we group them together to improve readability. |
| docs_diff_with_count = ( |
| docs_diff.groupby( |
| ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"] |
| ) |
| .size() |
| .reset_index(name="Count") |
| ) |
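| # For example (hypothetical): if "schemas.Foo.id" and "schemas.Foo.kind" |
| # were both added, they are grouped under the parent "schemas.Foo" with |
| # a Count of 2. |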
| |
| # Merge the `Count` column into docs_diff |
| docs_diff = docs_diff.merge(docs_diff_with_count) |
| |
| # When the count is greater than 1, update the key with the name of the |
| # parent since we are consolidating keys with the same parent. |
| docs_diff.loc[docs_diff["Count"] > 1, "Key"] = docs_diff["Parent"] |
| |
| return docs_diff[ |
| ["Key", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"] |
| ].drop_duplicates() |
| |
| def _build_summary_message(self, api_name, is_feature): |
| """Returns a string containing the summary for a given api. The string |
| returned will be in the format `fix(<api_name>): update the API` |
| when `is_feature=False` and `feat(<api_name>)!: update the API` |
| when `is_feature=True`. |
| |
| args: |
| api_name (str): The name of the api to include in the summary. |
| is_feature (bool): If True, include the prefix `feat` otherwise use |
| `fix` |
| """ |
| |
| # Build the conventional commit string based on the arguments provided |
| commit_type = "feat" if is_feature else "fix" |
| return "{0}({1}): update the api".format(commit_type, api_name) |
| |
| def _get_keys_to_ignore(self): |
| """Returns a list of strings with keys to ignore because they rarely |
| affect functionality. |
| |
| Args: None |
| """ |
| keys_to_ignore = [ |
| "description", |
| "documentation", |
| "enum", |
| "etag", |
| "revision", |
| "title", |
| "url", |
| "rootUrl", |
| ] |
| return keys_to_ignore |
| |
| def _get_stable_versions(self, versions): |
| """Returns a pandas series `pd.Series()` of boolean values, |
| corresponding to the given series, indicating whether the version is |
| considered stable or not. |
| args: |
| versions (object): a pandas series containing version |
| information for all discovery artifacts. |
| """ |
| # Use a regex on the version to find versions with the pattern |
| # <v>.<0-9>.<0-9>.<0-9> . Any api that matches this pattern will be |
| # labeled as stable. In other words, v1, v1.4 and v1.4.5 is stable |
| # but v1b1 v1aplha and v1beta1 is not stable. |
| return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)").notnull() |
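| # Illustrative behavior (hypothetical versions): |
| # pd.Series(["v1", "v1.4", "v1beta1"]) -> True, True, False |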
| |
| def _get_summary_and_write_to_disk(self, dataframe, directory): |
| """Writes summary information to file about changes made to discovery |
| artifacts based on the provided dataframe and returns a dataframe |
| with the same. The file `'allapis.dataframe'` is saved to the current |
| working directory. |
| args: |
| dataframe (object): a pandas dataframe containing summary change |
| information for all discovery artifacts |
| directory (str): path where the summary file should be saved |
| """ |
| |
| dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"]) |
| |
| # Create a filter for features, which matches only rows with keys that |
| # have been deleted or added; it will be used as an argument in the |
| # `np.where()` call below. |
| filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | ( |
| dataframe["ChangeType"] == ChangeType.ADDED |
| ) |
| |
| # Create a new column `IsFeature` to indicate which rows should be |
| # considered as features. |
| dataframe["IsFeature"] = np.where(filter_features, True, np.nan) |
| |
| # Create a new column `IsFeatureAggregate` which will be used to |
| # summarize the API changes. A given API is summarized as either a |
| # feature or a fix, but not both. |
| dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform( |
| lambda x: x.any() |
| ) |
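| # For example (hypothetical): if any "drive" row has IsFeature set, every |
| # "drive" row gets IsFeatureAggregate True, so the API is summarized as a |
| # `feat` commit rather than a `fix`. |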
| |
| # Create a new column `Summary`, which will contain a string with the |
| # conventional commit message. |
| dataframe["Summary"] = np.vectorize(self._build_summary_message)( |
| dataframe["Name"], dataframe["IsFeatureAggregate"] |
| ) |
| |
| # Write the final dataframe to disk as it will be used in the |
| # buildprbody.py script |
| dataframe.to_csv(directory / "allapis.dataframe") |
| return dataframe |
| |
| def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df): |
| """Writes verbose information to file about changes made to discovery |
| artifacts based on the provided dataframe. A separate file is saved |
| for each api in the current working directory. The extension of the |
| files will be `'.verbose'`. |
| |
| args: |
| dataframe (object): a pandas dataframe containing verbose change |
| information for all discovery artifacts |
| directory (str): path where the summary file should be saved |
| summary_df (object): A dataframe containing a summary of the changes |
| """ |
| # List of strings which will contain verbose change information for |
| # each API |
| verbose_changes = [] |
| |
| # Sort the dataframe to minimize file operations below. |
| dataframe.sort_values( |
| by=["Name", "Version", "ChangeType"], ascending=True, inplace=True |
| ) |
| |
| # Select only the relevant columns. We need to create verbose output |
| # by API name, version and ChangeType, so we group by these columns. |
| |
| change_type_groups = dataframe[ |
| ["Name", "Version", "ChangeType", "Key", "Count"] |
| ].groupby(["Name", "Version", "ChangeType"]) |
| |
| lastApi = "" |
| lastVersion = "" |
| lastType = ChangeType.UNKNOWN |
| |
| f = None |
| for name, group in change_type_groups: |
| currentApi = name[0] |
| currentVersion = name[1] |
| currentType = name[2] |
| |
| # We need to handle file opening and closing when processing an API |
| # which is different from the previous one |
| if lastApi != currentApi: |
| # If we are processing a new api, close the file used for |
| # processing the previous API |
| if f is not None: |
| f.writelines(verbose_changes) |
| f.close() |
| f = None |
| # Clear the array of strings with information from the previous |
| # api and reset the last version |
| verbose_changes = [] |
| lastVersion = "" |
| # Create a file which contains verbose changes for the current |
| # API being processed |
| filename = "{0}.verbose".format(currentApi) |
| f = open(pathlib.Path(directory / filename), "a") |
| lastApi = currentApi |
| |
| # Create a filter with only the rows for the current API |
| current_api_filter = summary_df["Name"] == currentApi |
| |
| # Get the string in the `Summary` column for the current api and |
| # append it to `verbose_changes`. The `Summary` column contains |
| # the conventional commit message. Use pandas.Series.iloc[0] to |
| # retrieve only the first element, since all the values in the |
| # summary column are the same for a given API. |
| verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0]) |
| |
| # If the version has changed, we need to append a new heading in the |
| # verbose summary which contains the API and version. |
| if lastVersion != currentVersion: |
| # Append a header string with the API and version |
| verbose_changes.append( |
| "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion) |
| ) |
| |
| lastVersion = currentVersion |
| lastType = ChangeType.UNKNOWN |
| |
| # Whenever the change type is different, we need to create a new |
| # heading for the group of keys with the same change type. |
| if currentType != lastType: |
| if currentType == ChangeType.DELETED: |
| verbose_changes.append("\nThe following keys were deleted:\n") |
| elif currentType == ChangeType.ADDED: |
| verbose_changes.append("\nThe following keys were added:\n") |
| else: |
| verbose_changes.append("\nThe following keys were changed:\n") |
| |
| lastType = currentType |
| |
| # Append the keys, and corresponding count, in the same change |
| # type group. |
| verbose_changes.extend( |
| [ |
| "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"]) |
| for index, row in group[["Key", "Count"]].iterrows() |
| ] |
| ) |
| |
| # Make sure to close the last file and write the changes. |
| if f is not None: |
| f.writelines(verbose_changes) |
| f.close() |
| f = None |
| |
| def detect_discovery_changes(self): |
| """Writes a summary of the changes to the discovery artifacts to disk |
| at the path specified in `temp_dir`. |
| |
| Args: None |
| """ |
| result = pd.DataFrame() |
| # Process files in parallel to improve performance |
| with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool: |
| frames = pool.map( |
| self._get_discovery_differences, |
| self._file_list, |
| MULTIPROCESSING_NUM_PER_BATCH, |
| ) |
| # `DataFrame.append` was removed in pandas 2.x, so combine the per-file |
| # dataframes with `pd.concat` instead. |
| if frames: |
| result = pd.concat(frames) |
| |
| if len(result): |
| # Sort the resulting dataframe by `Name`, `Version`, `ChangeType` |
| # and `Key` |
| sort_columns = ["Name", "Version", "ChangeType", "Key"] |
| result.sort_values(by=sort_columns, ascending=True, inplace=True) |
| |
| # Create a folder which will be used by the `createcommits.sh` and |
| # `buildprbody.py` scripts. |
| pathlib.Path(self._temp_dir).mkdir(exist_ok=True) |
| |
| # Create a summary which contains a conventional commit message |
| # for each API and write it to disk. |
| summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir) |
| |
| # Create verbose change information for each API which contains |
| # a list of changes by key and write it to disk. |
| self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df) |
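| |
| |
| # Example usage (a minimal sketch; assumes the branch directory holds the |
| # new artifacts and that a "temp" directory already exists, since |
| # __init__ validates all three paths): |
| # |
| # summary = ChangeSummary( |
| # BRANCH_ARTIFACTS_DIR, MAIN_ARTIFACTS_DIR, "temp", ["drive.v3.json"] |
| # ) |
| # summary.detect_discovery_changes() |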