# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import json
from multiprocessing import Pool
import pandas as pd
import pathlib
import numpy as np

BRANCH_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve() / "googleapiclient" / "discovery_cache" / "documents"
)
MAIN_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve() / ".." / "main" / "googleapiclient" / "discovery_cache" / "documents"
)

MULTIPROCESSING_NUM_PER_BATCH = 5
MULTIPROCESSING_NUM_AGENTS = 10


class ChangeType(IntEnum):
    UNKNOWN = 0
    DELETED = 1
    ADDED = 2
    CHANGED = 3


class DirectoryDoesNotExist(ValueError):
    """Raised when the specified directory does not exist."""

    pass


class ChangeSummary:
    """Represents the change summary between 2 directories containing \
    artifacts.
    """

    def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list):
        """Initializes an instance of a ChangeSummary.

        Args:
            new_artifacts_dir (str): The relative path to the directory with the
                new discovery artifacts.
            current_artifacts_dir (str): The relative path to the directory with
                the current discovery artifacts.
            temp_dir (str): The relative path to the directory used for
                temporary storage where intermediate files will be stored.
            file_list (list): A list of strings containing files to analyze.
        """

        self._file_list = file_list
        self._new_artifacts_dir = pathlib.Path(new_artifacts_dir)
        self._current_artifacts_dir = pathlib.Path(current_artifacts_dir)
        self._temp_dir = pathlib.Path(temp_dir)

        # Sanity checks to ensure directories exist
        self._raise_if_directory_not_found(self._new_artifacts_dir)
        self._raise_if_directory_not_found(self._current_artifacts_dir)
        self._raise_if_directory_not_found(self._temp_dir)

    def _raise_if_directory_not_found(self, directory):
        """Raises DirectoryDoesNotExist if the given `directory` doesn't exist.

        Args:
            directory (str): The relative path to the `directory`.
        """

        if not pathlib.Path(directory).exists():
            raise DirectoryDoesNotExist(
                "Directory does not exist : {0}".format(directory)
            )

    def _load_json_to_dataframe(self, file_path):
        """Returns a pandas dataframe from the json file provided.

        Args:
            file_path (str): The relative path to the discovery artifact to
                parse.
        """

        # Create an empty dataframe as we will need to return it if the file
        # doesn't exist
        dataframe_doc = pd.DataFrame()

        if pathlib.Path(file_path).is_file():
            with open(file_path, "r") as f:
                # Now load the json file into a pandas dataframe as a flat table
                dataframe_doc = pd.json_normalize(json.load(f))
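                # For example, a document like {"schemas": {"Foo": {"type":
                # "object"}}} is flattened into a single row with a column
                # named "schemas.Foo.type".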
        return dataframe_doc

    def _get_discovery_differences(self, filename):
        """Returns a pandas dataframe which contains the differences between
        the current and new discovery artifact directories, corresponding to
        the file name provided.

        Args:
            filename (str): The name of the discovery artifact to parse.
        """
        # The paths of the 2 discovery artifacts to compare
        current_artifact_path = self._current_artifacts_dir / filename
        new_artifact_path = self._new_artifacts_dir / filename

        # Use a helper function to load the discovery artifacts into pandas
        # dataframes
        current_doc = self._load_json_to_dataframe(current_artifact_path)
        new_doc = self._load_json_to_dataframe(new_artifact_path)

        # Concatenate the 2 dataframes, transpose them, and create
        # a new dataframe called combined_docs with columns
        # `Key`, `CurrentValue`, `NewValue`.
        combined_docs = (
            pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"])
            # Drop level 1 of the index (the original row index)
            .reset_index(drop=True, level=1)
            # Transpose the dataframe. The resulting columns should be
            # ["Key", "CurrentValue", "NewValue"]
            .rename_axis(["Key"], axis=1)
            .transpose()
            # Move the `Key` index into a regular column
            .reset_index()
        )
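        # At this point `combined_docs` has one row per flattened key and looks
        # roughly like (values illustrative only):
        #
        #        Key               CurrentValue  NewValue
        #    0   schemas.Foo.type  object        object
        #    1   schemas.Foo.id    Foo           Foo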

        # When discovery documents are added, the column `CurrentValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "CurrentValue" not in combined_docs.columns:
            combined_docs["CurrentValue"] = np.nan

        # When discovery documents are deleted, the column `NewValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "NewValue" not in combined_docs.columns:
            combined_docs["NewValue"] = np.nan

        # Split the Key into 2 columns, `Parent` and `Child`, in order
        # to group keys with the same parents together to summarize the changes
        # by parent.
        parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True)
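        # For example, the key "schemas.Foo.properties.bar.type" is split into
        # Parent "schemas.Foo.properties.bar" and Child "type".
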
        # Rename the columns and join them with the combined_docs dataframe.
        # If we only have a `Parent` column, it means that the Key doesn't have
        # any children.
        if len(parent_child_df.columns) == 1:
            parent_child_df.columns = ["Parent"]
        else:
            parent_child_df.columns = ["Parent", "Child"]
        combined_docs = combined_docs.join(parent_child_df)

        # Create a new column `Added` to identify rows which have new keys.
        combined_docs["Added"] = np.where(
            combined_docs["CurrentValue"].isnull(), True, False
        )

        # Create a new column `Deleted` to identify rows which have deleted keys.
        combined_docs["Deleted"] = np.where(
            combined_docs["NewValue"].isnull(), True, False
        )

        # Aggregate the keys added by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_added_agg = (
            combined_docs.groupby("Parent")
            .Added.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_added_agg["NumLevels"] = (
            parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Aggregate the keys deleted by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_deleted_agg = (
            combined_docs.groupby("Parent")
            .Deleted.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_deleted_agg["NumLevels"] = (
            parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Create a list of all parents that have been added, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is new, as
        # all children keys have been added.
        all_added = (
            parent_added_agg[
                (parent_added_agg["Proportion"] == 1)
                & (parent_added_agg["Added"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Create a list of all parents that have been deleted, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is deleted,
        # as all children keys have been deleted.
        all_deleted = (
            parent_deleted_agg[
                (parent_deleted_agg["Proportion"] == 1)
                & (parent_deleted_agg["Deleted"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Go through the list of parents that have been added. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is new. We don't need verbose information about the children,
        # so we replace the parent.
        for word in all_added:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word),
                word,
                combined_docs.Parent,
            )
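        # For example, if "schemas.NewResource" appears in `all_added`, a key
        # parent such as "schemas.NewResource.properties" is collapsed to just
        # "schemas.NewResource". ("NewResource" is an illustrative name only.)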

        # Go through the list of parents that have been deleted. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is deleted. We don't need verbose information about the
        # children, so we replace the parent.
        for word in all_deleted:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word),
                word,
                combined_docs.Parent,
            )

        # Create a new dataframe with only the keys which have changed
        docs_diff = combined_docs[
            combined_docs["CurrentValue"] != combined_docs["NewValue"]
        ].copy(deep=False)

        # Get the API and Version from the file name but exclude the extension.
        api_version_string = filename.split(".")[:-1]
        # Create columns `Name` and `Version` using the version string
        docs_diff["Name"] = api_version_string[0]
        docs_diff["Version"] = ".".join(api_version_string[1:])
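        # For example, the filename "drive.v3.json" yields Name "drive" and
        # Version "v3".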

        # These conditions are used as arguments in the `np.where` function
        # below.
        deleted_condition = docs_diff["NewValue"].isnull()
        added_condition = docs_diff["CurrentValue"].isnull()

        # Create a new `ChangeType` column. The `np.where()` function acts like
        # a ternary operator. When `deleted_condition` is `True`, the
        # `ChangeType` will be `ChangeType.DELETED`. If `added_condition` is
        # `True`, the `ChangeType` will be `ChangeType.ADDED`; otherwise the
        # `ChangeType` will be `ChangeType.CHANGED`.
        docs_diff["ChangeType"] = np.where(
            deleted_condition,
            ChangeType.DELETED,
            np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED),
        )

        # Filter out keys which rarely affect functionality. For example:
        # {"description", "documentation", "enum", "etag", "revision", "title",
        # "url", "rootUrl"}
        docs_diff = docs_diff[
            ~docs_diff["Key"].str.contains(
                "|".join(self._get_keys_to_ignore()), case=False
            )
        ]

        # Group keys with the same parent together and create a new column
        # called 'Count' which indicates the number of keys that have been
        # grouped together. The reason for the count column is that when keys
        # have the same parent, we group them together to improve readability.
        docs_diff = (
            docs_diff.groupby(
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"]
            )
            .size()
            .reset_index(name="Count")[
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"]
            ]
        )

        # Rename the Parent column to the Key column since we are reporting
        # summary information of keys with the same parent.
        docs_diff.rename(columns={"Parent": "Key"}, inplace=True)
        return docs_diff

    def _build_summary_message(self, api_name, is_feature):
        """Returns a string containing the summary for a given api. The string
        returned will be in the format `fix(<api_name>): update the api`
        when `is_feature=False` and `feat(<api_name>): update the api`
        when `is_feature=True`.

        Args:
            api_name (str): The name of the api to include in the summary.
            is_feature (bool): If True, use the prefix `feat`, otherwise use
                `fix`.
        """

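        # For example, with api_name="drive" and is_feature=True, the method
        # returns "feat(drive): update the api".
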
        # Build the conventional commit string based on the arguments provided
        commit_type = "feat" if is_feature else "fix"
        return "{0}({1}): update the api".format(commit_type, api_name)

    def _get_keys_to_ignore(self):
        """Returns a list of strings with keys to ignore because they rarely
        affect functionality.

        Args: None
        """
        keys_to_ignore = [
            "description",
            "documentation",
            "enum",
            "etag",
            "revision",
            "title",
            "url",
            "rootUrl",
        ]
        return keys_to_ignore

    def _get_stable_versions(self, versions):
        """Returns a pandas series `pd.Series()` of boolean values,
        corresponding to the given series, indicating whether the version is
        considered stable or not.

        Args:
            versions (object): a pandas series containing version
                information for all discovery artifacts.
        """
        # Use a regex on the version to find versions of the form v<digits>,
        # optionally followed by up to two more dot-separated numeric parts.
        # Any api version that matches this pattern will be labeled as stable.
        # In other words, v1, v1.4 and v1.4.5 are stable, but v1b1, v1alpha
        # and v1beta1 are not.
        return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)").notnull()

    def _get_summary_and_write_to_disk(self, dataframe, directory):
        """Writes summary information to file about changes made to discovery
        artifacts based on the provided dataframe and returns a dataframe
        with the same. The file `'allapis.dataframe'` is saved to the
        directory provided.

        Args:
            dataframe (object): a pandas dataframe containing summary change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
        """

        dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"])

        # Create a filter for features, which contains only rows which have keys
        # that have been deleted or added, that will be used as an argument in
        # the `np.where()` call below.
        filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | (
            dataframe["ChangeType"] == ChangeType.ADDED
        )

        # Create a new column `IsFeature` to indicate which rows should be
        # considered as features.
        dataframe["IsFeature"] = np.where(filter_features, True, np.nan)

        # Create a new column `IsFeatureAggregate` which will be used to
        # summarize the api changes. We can have either a feature or a fix for
        # a given api, but not both.
        dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform(
            lambda x: x.any()
        )

        # Create a new column `Summary`, which will contain a string with the
        # conventional commit message.
        dataframe["Summary"] = np.vectorize(self._build_summary_message)(
            dataframe["Name"], dataframe["IsFeatureAggregate"]
        )
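        # For example, a row with Name "drive" and IsFeatureAggregate True
        # receives the Summary "feat(drive): update the api".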

        # Write the final dataframe to disk as it will be used in the
        # buildprbody.py script
        dataframe.to_csv(directory / "allapis.dataframe")
        return dataframe

    def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df):
        """Writes verbose information to file about changes made to discovery
        artifacts based on the provided dataframe. A separate file is saved
        for each api in the directory provided. The extension of the files
        will be `'.verbose'`.

        Args:
            dataframe (object): a pandas dataframe containing verbose change
                information for all discovery artifacts
            directory (str): path where the verbose files should be saved
            summary_df (object): A dataframe containing a summary of the changes
        """
        # Array of strings which will contain verbose change information for
        # each api
        verbose_changes = []

        # Sort the dataframe to minimize file operations below.
        dataframe.sort_values(
            by=["Name", "Version", "ChangeType"], ascending=True, inplace=True
        )

        # Select only the relevant columns. We need to create verbose output
        # by api name, version and change type, so we group by these columns.
        change_type_groups = dataframe[
            ["Name", "Version", "ChangeType", "Key", "Count"]
        ].groupby(["Name", "Version", "ChangeType"])

        lastApi = ""
        lastVersion = ""
        lastType = ChangeType.UNKNOWN

        f = None
        for name, group in change_type_groups:
            currentApi = name[0]
            currentVersion = name[1]
            currentType = name[2]

            # We need to handle file opening and closing when processing an API
            # which is different from the previous one
            if lastApi != currentApi:
                # If we are processing a new api, close the file used for
                # processing the previous API
                if f is not None:
                    f.writelines(verbose_changes)
                    f.close()
                    f = None
                # Clear the array of strings with information from the previous
                # api and reset the last version
                verbose_changes = []
                lastVersion = ""
                # Create a file which contains verbose changes for the current
                # API being processed
                filename = "{0}.verbose".format(currentApi)
                f = open(pathlib.Path(directory / filename), "a")
                lastApi = currentApi

                # Create a filter with only the rows for the current API
                current_api_filter = summary_df["Name"] == currentApi

                # Get the string in the `Summary` column for the current api and
                # append it to `verbose_changes`. The `Summary` column contains
                # the conventional commit message. Use pandas.Series.iloc[0] to
                # retrieve only the first element, since all the values in the
                # summary column are the same for a given API.
                verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0])

            # If the version has changed, we need to append a new heading
            # in the verbose summary which contains the api and version.
            if lastVersion != currentVersion:
                # Append a header string with the API and version
                verbose_changes.append(
                    "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion)
                )

                lastVersion = currentVersion
                lastType = ChangeType.UNKNOWN

            # Whenever the change type is different, we need to create a new
            # heading for the group of keys with the same change type.
            if currentType != lastType:
                if currentType == ChangeType.DELETED:
                    verbose_changes.append("\nThe following keys were deleted:\n")
                elif currentType == ChangeType.ADDED:
                    verbose_changes.append("\nThe following keys were added:\n")
                else:
                    verbose_changes.append("\nThe following keys were changed:\n")

                lastType = currentType

            # Append the keys, and the corresponding count, in the same change
            # type group.
            verbose_changes.extend(
                [
                    "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"])
                    for index, row in group[["Key", "Count"]].iterrows()
                ]
            )
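            # Each entry produced above looks like, for example,
            # "- schemas.Foo.properties (Total Keys: 3)"
            # (the key name and count shown here are illustrative only).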

        # Make sure to close the last file and write the changes.
        if f is not None:
            f.writelines(verbose_changes)
            f.close()
            f = None

    def detect_discovery_changes(self):
        """Writes a summary of the changes to the discovery artifacts to disk
        at the path specified in `temp_dir`.

        Args: None
        """
        result = pd.DataFrame()
        # Process files in parallel to improve performance
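        # Note: MULTIPROCESSING_NUM_PER_BATCH is passed to `pool.map()` below
        # as its `chunksize` argument, so each worker receives files in batches.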
        with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool:
            result = result.append(
                pool.map(
                    self._get_discovery_differences,
                    self._file_list,
                    MULTIPROCESSING_NUM_PER_BATCH,
                )
            )

        if len(result):
            # Sort the resulting dataframe by `Name`, `Version`, `ChangeType`
            # and `Key`
            sort_columns = ["Name", "Version", "ChangeType", "Key"]
            result.sort_values(by=sort_columns, ascending=True, inplace=True)

            # Create a folder which will be used by the `createcommits.sh` and
            # `buildprbody.py` scripts.
            pathlib.Path(self._temp_dir).mkdir(exist_ok=True)

            # Create a summary which contains a conventional commit message
            # for each API and write it to disk.
            summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir)

            # Create verbose change information for each API which contains
            # a list of changes by key and write it to disk.
            self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df)