# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import json
from multiprocessing import Pool
import pandas as pd
import pathlib
import numpy as np

BRANCH_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)
MAIN_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve()
    / ".."
    / "main"
    / "googleapiclient"
    / "discovery_cache"
    / "documents"
)

MULTIPROCESSING_NUM_PER_BATCH = 5
MULTIPROCESSING_NUM_AGENTS = 10


class ChangeType(IntEnum):
    UNKNOWN = 0
    DELETED = 1
    ADDED = 2
    CHANGED = 3


class DirectoryDoesNotExist(ValueError):
    """Raised when the specified directory does not exist."""

    pass


class ChangeSummary:
    """Represents the change summary between 2 directories containing \
    artifacts.
    """

    def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list):
        """Initializes an instance of a ChangeSummary.

        Args:
            new_artifacts_dir (str): The relative path to the directory with the
                new discovery artifacts.
            current_artifacts_dir (str): The relative path to the directory with
                the current discovery artifacts.
            temp_dir (str): The relative path to the directory used for
                temporary storage where intermediate files will be stored.
            file_list (list): A list of strings containing files to analyze.
        """

        self._file_list = file_list
        self._new_artifacts_dir = pathlib.Path(new_artifacts_dir)
        self._current_artifacts_dir = pathlib.Path(current_artifacts_dir)
        self._temp_dir = pathlib.Path(temp_dir)

        # Sanity checks to ensure directories exist
        self._raise_if_directory_not_found(self._new_artifacts_dir)
        self._raise_if_directory_not_found(self._current_artifacts_dir)
        self._raise_if_directory_not_found(self._temp_dir)

    def _raise_if_directory_not_found(self, directory):
        """Raises DirectoryDoesNotExist if the `directory` doesn't exist.

        Args:
            directory (str): The relative path to the `directory`.
        """

        if not pathlib.Path(directory).exists():
            raise DirectoryDoesNotExist(
                "Directory does not exist : {0}".format(directory)
            )

    def _load_json_to_dataframe(self, file_path):
        """Returns a pandas dataframe from the json file provided.

        Args:
            file_path (str): The relative path to the discovery artifact to
                parse.
        """

        # Create an empty dataframe as we will need to return it if the file
        # doesn't exist
        dataframe_doc = pd.DataFrame()

        if pathlib.Path(file_path).is_file():
            with open(file_path, "r") as f:
                # Now load the json file into a pandas dataframe as a flat table
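                # Illustrative example (hypothetical document, not a real
                # artifact): a nested document such as
                #   {"schemas": {"Item": {"type": "object"}}}
                # is flattened by pd.json_normalize into a single-row
                # dataframe with one dotted column, "schemas.Item.type",
                # holding the value "object".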
                dataframe_doc = pd.json_normalize(json.load(f))
        return dataframe_doc

    def _get_discovery_differences(self, filename):
        """Returns a pandas dataframe which contains the differences between
        the current and new discovery artifacts, corresponding to the file
        name provided.

        Args:
            filename (str): The name of the discovery artifact to parse.
        """
        # The paths of the 2 discovery artifacts to compare
        current_artifact_path = self._current_artifacts_dir / filename
        new_artifact_path = self._new_artifacts_dir / filename

        # Use a helper function to load the discovery artifacts into pandas
        # dataframes
        current_doc = self._load_json_to_dataframe(current_artifact_path)
        new_doc = self._load_json_to_dataframe(new_artifact_path)

        # Concatenate the 2 dataframes, transpose them, and create
        # a new dataframe called combined_docs with columns
        # `Key`, `CurrentValue`, `NewValue`.
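        # Illustrative example (hypothetical values): if the current artifact
        # contains {"title": "Drive API"} and the new artifact contains
        # {"title": "Drive"}, combined_docs will hold the single row
        #   Key="title", CurrentValue="Drive API", NewValue="Drive".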
        combined_docs = (
            pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"])
            # Drop the original per-document index level
            .reset_index(drop=True, level=1)
            # Transpose the dataframe. The resulting columns should be
            # ["Key", "CurrentValue", "NewValue"]
            .rename_axis(["Key"], axis=1).transpose()
            # Move the `Key` index into a regular column
            .reset_index()
        )

        # When discovery documents are added, the column `CurrentValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "CurrentValue" not in combined_docs.columns:
            combined_docs["CurrentValue"] = np.nan

        # When discovery documents are deleted, the column `NewValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "NewValue" not in combined_docs.columns:
            combined_docs["NewValue"] = np.nan

        # Split the Key into 2 columns, `Parent` and `Child`, in order
        # to group keys with the same parents together and summarize the
        # changes by parent.
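        # For example (illustrative): the key "schemas.Item.type" is split
        # into Parent="schemas.Item" and Child="type".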
        parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True)
        # Rename the columns and join them with the combined_docs dataframe.
        # If we only have a `Parent` column, it means that the Key doesn't have
        # any children.
        if len(parent_child_df.columns) == 1:
            parent_child_df.columns = ["Parent"]
        else:
            parent_child_df.columns = ["Parent", "Child"]
        combined_docs = combined_docs.join(parent_child_df)

        # Create a new column `Added` to identify rows which have new keys.
        combined_docs["Added"] = np.where(
            combined_docs["CurrentValue"].isnull(), True, False
        )

        # Create a new column `Deleted` to identify rows which have deleted keys.
        combined_docs["Deleted"] = np.where(
            combined_docs["NewValue"].isnull(), True, False
        )

        # Aggregate the keys added by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_added_agg = (
            combined_docs.groupby("Parent")
            .Added.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
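        # For example (illustrative): the parent "schemas" has 1 level, while
        # "schemas.Item.properties" has 3.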
        parent_added_agg["NumLevels"] = (
            parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Aggregate the keys deleted by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_deleted_agg = (
            combined_docs.groupby("Parent")
            .Deleted.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_deleted_agg["NumLevels"] = (
            parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Create a list of all parents that have been added, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is new, as
        # all of its children keys have been added.
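        # For example (illustrative): if every key under "schemas.NewItem"
        # has Added == True, the proportion of added keys for that parent is
        # 1, so "schemas.NewItem" itself is treated as newly added.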
        all_added = (
            parent_added_agg[
                (parent_added_agg["Proportion"] == 1)
                & (parent_added_agg["Added"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Create a list of all parents that have been deleted, in hierarchical
        # order. When `Proportion` is 1, it means that the parent has been
        # deleted, as all of its children keys have been deleted.
        all_deleted = (
            parent_deleted_agg[
                (parent_deleted_agg["Proportion"] == 1)
                & (parent_deleted_agg["Deleted"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Go through the list of parents that have been added. If we find any
        # keys whose parent starts with a parent in this list, then the
        # entire parent is new. We don't need verbose information about the
        # children, so we replace the parent.
        for word in all_added:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Go through the list of parents that have been deleted. If we find any
        # keys whose parent starts with a parent in this list, then the
        # entire parent is deleted. We don't need verbose information about
        # the children, so we replace the parent.
        for word in all_deleted:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word), word, combined_docs.Parent
            )

        # Create a new dataframe with only the keys which have changed
        docs_diff = combined_docs[
            combined_docs["CurrentValue"] != combined_docs["NewValue"]
        ].copy(deep=False)

        # Get the API and Version from the file name, excluding the extension.
        api_version_string = filename.split(".")[:-1]
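        # For example (illustrative): "drive.v3.json" yields
        # api_version_string == ["drive", "v3"], so Name="drive" and
        # Version="v3" below.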
        # Create columns `Name` and `Version` using the version string
        docs_diff["Name"] = api_version_string[0]
        docs_diff["Version"] = ".".join(api_version_string[1:])

        # These conditions are used as arguments in the `np.where` function
        # below.
        deleted_condition = docs_diff["NewValue"].isnull()
        added_condition = docs_diff["CurrentValue"].isnull()

        # Create a new `ChangeType` column. The `np.where()` function is like a
        # ternary operator. When the `deleted_condition` is `True`, the
        # `ChangeType` will be `ChangeType.DELETED`. If the `added_condition` is
        # `True`, the `ChangeType` will be `ChangeType.ADDED`, otherwise the
        # `ChangeType` will be `ChangeType.CHANGED`.
        docs_diff["ChangeType"] = np.where(
            deleted_condition,
            ChangeType.DELETED,
            np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED),
        )

        # Filter out keys which rarely affect functionality. For example:
        # {"description", "documentation", "enum", "etag", "revision", "title",
        # "url", "rootUrl"}
        docs_diff = docs_diff[
            ~docs_diff["Key"].str.contains(
                "|".join(self._get_keys_to_ignore()), case=False
            )
        ]

        # Group keys with similar parents together and create a new column
        # called 'Count' which indicates the number of keys that have been
        # grouped together. The reason for the count column is that when keys
        # have the same parent, we group them together to improve readability.
        docs_diff_with_count = (
            docs_diff.groupby(
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"]
            )
            .size()
            .reset_index(name="Count")
        )

        # Add the `Count` column to docs_diff
        docs_diff = docs_diff.merge(docs_diff_with_count)

        # When the count is greater than 1, update the key with the name of the
        # parent since we are consolidating keys with the same parent.
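        # For example (illustrative): if "schemas.Item.id" and
        # "schemas.Item.name" were both added, both rows get
        # Key="schemas.Item" with Count=2 and collapse into a single row
        # after drop_duplicates() below.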
        docs_diff.loc[docs_diff["Count"] > 1, "Key"] = docs_diff["Parent"]

        return docs_diff[
            ["Key", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"]
        ].drop_duplicates()

    def _build_summary_message(self, api_name, is_feature):
        """Returns a string containing the summary for a given api. The string
        returned will be in the format `fix(<api_name>): update the api`
        when `is_feature=False` and `feat(<api_name>): update the api`
        when `is_feature=True`.

        Args:
            api_name (str): The name of the api to include in the summary.
            is_feature (bool): If True, use the prefix `feat`, otherwise use
                `fix`.
        """

        # Build the conventional commit string based on the arguments provided
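        # For example (illustrative): api_name="drive" with is_feature=True
        # yields "feat(drive): update the api".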
        commit_type = "feat" if is_feature else "fix"
        return "{0}({1}): update the api".format(commit_type, api_name)

    def _get_keys_to_ignore(self):
        """Returns a list of strings with keys to ignore because they rarely
        affect functionality.

        Args: None
        """
        keys_to_ignore = [
            "description",
            "documentation",
            "enum",
            "etag",
            "revision",
            "title",
            "url",
            "rootUrl",
        ]
        return keys_to_ignore

    def _get_stable_versions(self, versions):
        """Returns a pandas series `pd.Series()` of boolean values,
        corresponding to the given series, indicating whether the version is
        considered stable or not.

        Args:
            versions (object): a pandas series containing version
                information for all discovery artifacts.
        """
        # Use a regex on the version to find versions with the pattern
        # <v>.<0-9>.<0-9>.<0-9>. Any api that matches this pattern will be
        # labeled as stable. In other words, v1, v1.4 and v1.4.5 are stable,
        # but v1b1, v1alpha and v1beta1 are not.
        return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)", expand=False).notnull()

    def _get_summary_and_write_to_disk(self, dataframe, directory):
        """Writes summary information to file about changes made to discovery
        artifacts based on the provided dataframe and returns a dataframe
        with the same. The file `'allapis.dataframe'` is saved to `directory`.

        Args:
            dataframe (object): a pandas dataframe containing summary change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
        """

        dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"])

        # Create a filter for features, which contains only rows which have keys
        # that have been deleted or added, that will be used as an argument in
        # the `np.where()` call below.
        filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | (
            dataframe["ChangeType"] == ChangeType.ADDED
        )

        # Create a new column `IsFeature` to indicate which rows should be
        # considered as features.
        dataframe["IsFeature"] = np.where(filter_features, True, np.nan)

        # Create a new column `IsFeatureAggregate` which will be used to
        # summarize the api changes. We can either have a feature or a fix,
        # but not both.
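        # For example (illustrative): if any row for the "drive" api has
        # IsFeature == True, every "drive" row gets IsFeatureAggregate ==
        # True, so the whole api is summarized as a feature.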
        dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform(
            lambda x: x.any()
        )

        # Create a new column `Summary`, which will contain a string with the
        # conventional commit message.
        dataframe["Summary"] = np.vectorize(self._build_summary_message)(
            dataframe["Name"], dataframe["IsFeatureAggregate"]
        )

        # Write the final dataframe to disk as it will be used in the
        # buildprbody.py script
        dataframe.to_csv(directory / "allapis.dataframe")
        return dataframe

    def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df):
        """Writes verbose information to file about changes made to discovery
        artifacts based on the provided dataframe. A separate file is saved
        for each api in `directory`. The extension of the files will be
        `'.verbose'`.

        Args:
            dataframe (object): a pandas dataframe containing verbose change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
            summary_df (object): A dataframe containing a summary of the changes
        """
        # Array of strings which will contain verbose change information for
        # each api
        verbose_changes = []

        # Sort the dataframe to minimize file operations below.
        dataframe.sort_values(
            by=["Name", "Version", "ChangeType"], ascending=True, inplace=True
        )

        # Select only the relevant columns. We need to create verbose output
        # by api name, version and change type, so we group by these columns.
        change_type_groups = dataframe[
            ["Name", "Version", "ChangeType", "Key", "Count"]
        ].groupby(["Name", "Version", "ChangeType"])

        lastApi = ""
        lastVersion = ""
        lastType = ChangeType.UNKNOWN

        f = None
        for name, group in change_type_groups:
            currentApi = name[0]
            currentVersion = name[1]
            currentType = name[2]

            # We need to handle file opening and closing when processing an API
            # which is different from the previous one
            if lastApi != currentApi:
                # If we are processing a new api, close the file used for
                # processing the previous API
                if f is not None:
                    f.writelines(verbose_changes)
                    f.close()
                    f = None
                # Clear the array of strings with information from the previous
                # api and reset the last version
                verbose_changes = []
                lastVersion = ""
                # Create a file which contains verbose changes for the current
                # API being processed
                filename = "{0}.verbose".format(currentApi)
                f = open(pathlib.Path(directory / filename), "a")
                lastApi = currentApi

            # Create a filter with only the rows for the current API
            current_api_filter = summary_df["Name"] == currentApi

            # Get the string in the `Summary` column for the current api and
            # append it to `verbose_changes`. The `Summary` column contains
            # the conventional commit message. Use pandas.Series.iloc[0] to
            # retrieve only the first element, since all the values in the
            # summary column are the same for a given API.
            verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0])

            # If the version has changed, we need to append a new heading
            # in the verbose summary which contains the api and version.
            if lastVersion != currentVersion:
                # Append a header string with the API and version
                verbose_changes.append(
                    "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion)
                )

                lastVersion = currentVersion
                lastType = ChangeType.UNKNOWN

            # Whenever the change type is different, we need to create a new
            # heading for the group of keys with the same change type.
            if currentType != lastType:
                if currentType == ChangeType.DELETED:
                    verbose_changes.append("\nThe following keys were deleted:\n")
                elif currentType == ChangeType.ADDED:
                    verbose_changes.append("\nThe following keys were added:\n")
                else:
                    verbose_changes.append("\nThe following keys were changed:\n")

                lastType = currentType

            # Append the keys, and corresponding count, in the same change
            # type group.
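            # For example (illustrative), a rendered entry looks like:
            #   - schemas.Item (Total Keys: 2)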
            verbose_changes.extend(
                [
                    "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"])
                    for index, row in group[["Key", "Count"]].iterrows()
                ]
            )

        # Make sure to close the last file and write the changes.
        if f is not None:
            f.writelines(verbose_changes)
            f.close()
            f = None

    def detect_discovery_changes(self):
        """Writes a summary of the changes to the discovery artifacts to disk
        at the path specified in `temp_dir`.

        Args: None
        """
        result = pd.DataFrame()
        # Process files in parallel to improve performance
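        # Note: the third argument to pool.map is its `chunksize`; each worker
        # receives MULTIPROCESSING_NUM_PER_BATCH files at a time.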
        with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool:
            frames = pool.map(
                self._get_discovery_differences,
                self._file_list,
                MULTIPROCESSING_NUM_PER_BATCH,
            )
        # Combine the per-file dataframes into a single dataframe.
        if frames:
            result = pd.concat(frames, ignore_index=True)

        if len(result):
            # Sort the resulting dataframe by `Name`, `Version`, `ChangeType`
            # and `Key`
            sort_columns = ["Name", "Version", "ChangeType", "Key"]
            result.sort_values(by=sort_columns, ascending=True, inplace=True)

            # Create a folder which will be used by the `createcommits.sh` and
            # `buildprbody.py` scripts.
            pathlib.Path(self._temp_dir).mkdir(exist_ok=True)

            # Create a summary which contains a conventional commit message
            # for each API and write it to disk.
            summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir)

            # Create verbose change information for each API which contains
            # a list of changes by key and write it to disk.
            self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df)