# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import IntEnum
import json
from multiprocessing import Pool
import pandas as pd
import pathlib
import numpy as np

BRANCH_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve() / "googleapiclient" / "discovery_cache" / "documents"
)
MAIN_ARTIFACTS_DIR = (
    pathlib.Path(__file__).parent.resolve() / ".." / "main" / "googleapiclient" / "discovery_cache" / "documents"
)

MULTIPROCESSING_NUM_PER_BATCH = 5
MULTIPROCESSING_NUM_AGENTS = 10


class ChangeType(IntEnum):
    UNKNOWN = 0
    DELETED = 1
    ADDED = 2
    CHANGED = 3


class DirectoryDoesNotExist(ValueError):
    """Raised when the specified directory does not exist."""

    pass


class ChangeSummary:
    """Represents the change summary between 2 directories containing \
    artifacts.
    """

    def __init__(self, new_artifacts_dir, current_artifacts_dir, temp_dir, file_list):
        """Initializes an instance of a ChangeSummary.

        Args:
            new_artifacts_dir (str): The relative path to the directory with the
                new discovery artifacts.
            current_artifacts_dir (str): The relative path to the directory with
                the current discovery artifacts.
            temp_dir (str): The relative path to the directory used for
                temporary storage where intermediate files will be stored.
            file_list (list): A list of strings containing files to analyze.
        """

        self._file_list = file_list
        self._new_artifacts_dir = pathlib.Path(new_artifacts_dir)
        self._current_artifacts_dir = pathlib.Path(current_artifacts_dir)
        self._temp_dir = pathlib.Path(temp_dir)

        # Sanity checks to ensure directories exist
        self._raise_if_directory_not_found(self._new_artifacts_dir)
        self._raise_if_directory_not_found(self._current_artifacts_dir)
        self._raise_if_directory_not_found(self._temp_dir)

    def _raise_if_directory_not_found(self, directory):
        """Raises DirectoryDoesNotExist if the given `directory` doesn't exist.

        Args:
            directory (str): The relative path to the `directory`.
        """

        if not pathlib.Path(directory).exists():
            raise DirectoryDoesNotExist(
                "Directory does not exist : {0}".format(directory)
            )

    def _load_json_to_dataframe(self, file_path):
        """Returns a pandas dataframe from the json file provided.

        Args:
            file_path (str): The relative path to the discovery artifact to
                parse.
        """

        # Create an empty dataframe as we will need to return it if the file
        # doesn't exist
        dataframe_doc = pd.DataFrame()

        if pathlib.Path(file_path).is_file():
            with open(file_path, "r") as f:
                # Now load the json file into a pandas dataframe as a flat table
                dataframe_doc = pd.json_normalize(json.load(f))
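                # For example, a document like {"schemas": {"Foo": {"type":
                # "object"}}} is flattened into a single row with a column
                # named "schemas.Foo.type".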
        return dataframe_doc

    def _get_discovery_differences(self, filename):
        """Returns a pandas dataframe which contains the differences between
        the current and new discovery artifact directories, corresponding to
        the file name provided.

        Args:
            filename (str): The name of the discovery artifact to parse.
        """
        # The paths of the 2 discovery artifacts to compare
        current_artifact_path = self._current_artifacts_dir / filename
        new_artifact_path = self._new_artifacts_dir / filename

        # Use a helper function to load the discovery artifacts into pandas
        # dataframes
        current_doc = self._load_json_to_dataframe(current_artifact_path)
        new_doc = self._load_json_to_dataframe(new_artifact_path)

        # Concatenate the 2 dataframes, transpose them, and create
        # a new dataframe called combined_docs with columns
        # `Key`, `CurrentValue`, `NewValue`.
        combined_docs = (
            pd.concat([current_doc, new_doc], keys=["CurrentValue", "NewValue"])
            # Drop level 1 of the index (the original row index)
            .reset_index(drop=True, level=1)
            # Transpose the dataframe. The resulting columns should be
            # ["Key", "CurrentValue", "NewValue"]
            .rename_axis(["Key"], axis=1)
            .transpose()
            # Move the `Key` index into a regular column
            .reset_index()
        )
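        # At this point `combined_docs` has one row per flattened key and looks
        # roughly like (values illustrative only):
        #
        #        Key               CurrentValue  NewValue
        #    0   schemas.Foo.type  object        object
        #    1   schemas.Foo.id    Foo           Foo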

        # When discovery documents are added, the column `CurrentValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "CurrentValue" not in combined_docs.columns:
            combined_docs["CurrentValue"] = np.nan

        # When discovery documents are deleted, the column `NewValue` will
        # not exist. In that case, we'll just populate with `np.nan`.
        if "NewValue" not in combined_docs.columns:
            combined_docs["NewValue"] = np.nan

        # Split the Key into 2 columns, `Parent` and `Child`, in order
        # to group keys with the same parents together to summarize the changes
        # by parent.
        parent_child_df = combined_docs["Key"].str.rsplit(".", n=1, expand=True)
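        # For example, the key "schemas.Foo.properties.bar.type" is split into
        # Parent "schemas.Foo.properties.bar" and Child "type".
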
        # Rename the columns and join them with the combined_docs dataframe.
        # If we only have a `Parent` column, it means that the Key doesn't have
        # any children.
        if len(parent_child_df.columns) == 1:
            parent_child_df.columns = ["Parent"]
        else:
            parent_child_df.columns = ["Parent", "Child"]
        combined_docs = combined_docs.join(parent_child_df)

        # Create a new column `Added` to identify rows which have new keys.
        combined_docs["Added"] = np.where(
            combined_docs["CurrentValue"].isnull(), True, False
        )

        # Create a new column `Deleted` to identify rows which have deleted keys.
        combined_docs["Deleted"] = np.where(
            combined_docs["NewValue"].isnull(), True, False
        )

        # Aggregate the keys added by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_added_agg = (
            combined_docs.groupby("Parent")
            .Added.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_added_agg["NumLevels"] = (
            parent_added_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Aggregate the keys deleted by grouping keys with the same parents
        # together to summarize the changes by parent rather than by key.
        parent_deleted_agg = (
            combined_docs.groupby("Parent")
            .Deleted.value_counts(normalize=True)
            .reset_index(name="Proportion")
        )

        # Add a column NumLevels to indicate the number of levels in the tree,
        # which will allow us to sort the parents in hierarchical order.
        parent_deleted_agg["NumLevels"] = (
            parent_deleted_agg["Parent"].str.split(".").apply(lambda x: len(x))
        )

        # Create a list of all parents that have been added, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is new, as
        # all children keys have been added.
        all_added = (
            parent_added_agg[
                (parent_added_agg["Proportion"] == 1)
                & (parent_added_agg["Added"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Create a list of all parents that have been deleted, in hierarchical
        # order. When `Proportion` is 1, it means that the parent is deleted,
        # as all children keys have been deleted.
        all_deleted = (
            parent_deleted_agg[
                (parent_deleted_agg["Proportion"] == 1)
                & (parent_deleted_agg["Deleted"] == True)
            ][["Parent", "NumLevels"]]
            .sort_values("NumLevels", ascending=True)
            .Parent.to_list()
        )

        # Go through the list of parents that have been added. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is new. We don't need verbose information about the children,
        # so we replace the parent.
        for word in all_added:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word),
                word,
                combined_docs.Parent,
            )
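        # For example, if "schemas.NewResource" appears in `all_added`, a key
        # parent such as "schemas.NewResource.properties" is collapsed to just
        # "schemas.NewResource". ("NewResource" is an illustrative name only.)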

        # Go through the list of parents that have been deleted. If we find any
        # keys whose parent starts with a parent in this list, then the entire
        # parent is deleted. We don't need verbose information about the
        # children, so we replace the parent.
        for word in all_deleted:
            combined_docs.Parent = np.where(
                combined_docs["Parent"].str.startswith(word),
                word,
                combined_docs.Parent,
            )

        # Create a new dataframe with only the keys which have changed
        docs_diff = combined_docs[
            combined_docs["CurrentValue"] != combined_docs["NewValue"]
        ].copy(deep=False)

        # Get the API and Version from the file name but exclude the extension.
        api_version_string = filename.split(".")[:-1]
        # Create columns `Name` and `Version` using the version string
        docs_diff["Name"] = api_version_string[0]
        docs_diff["Version"] = ".".join(api_version_string[1:])
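        # For example, the filename "drive.v3.json" yields Name "drive" and
        # Version "v3".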

        # These conditions are used as arguments in the `np.where` function
        # below.
        deleted_condition = docs_diff["NewValue"].isnull()
        added_condition = docs_diff["CurrentValue"].isnull()

        # Create a new `ChangeType` column. The `np.where()` function acts like
        # a ternary operator. When `deleted_condition` is `True`, the
        # `ChangeType` will be `ChangeType.DELETED`. If `added_condition` is
        # `True`, the `ChangeType` will be `ChangeType.ADDED`; otherwise the
        # `ChangeType` will be `ChangeType.CHANGED`.
        docs_diff["ChangeType"] = np.where(
            deleted_condition,
            ChangeType.DELETED,
            np.where(added_condition, ChangeType.ADDED, ChangeType.CHANGED),
        )

        # Filter out keys which rarely affect functionality. For example:
        # {"description", "documentation", "enum", "etag", "revision", "title",
        # "url", "rootUrl"}
        docs_diff = docs_diff[
            ~docs_diff["Key"].str.contains(
                "|".join(self._get_keys_to_ignore()), case=False
            )
        ]

        # Group keys with the same parent together and create a new column
        # called 'Count' which indicates the number of keys that have been
        # grouped together. The reason for the count column is that when keys
        # have the same parent, we group them together to improve readability.
        docs_diff = (
            docs_diff.groupby(
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType"]
            )
            .size()
            .reset_index(name="Count")[
                ["Parent", "Added", "Deleted", "Name", "Version", "ChangeType", "Count"]
            ]
        )

        # Rename the Parent column to the Key column since we are reporting
        # summary information of keys with the same parent.
        docs_diff.rename(columns={"Parent": "Key"}, inplace=True)
        return docs_diff

    def _build_summary_message(self, api_name, is_feature):
        """Returns a string containing the summary for a given api. The string
        returned will be in the format `fix(<api_name>): update the api`
        when `is_feature=False` and `feat(<api_name>): update the api`
        when `is_feature=True`.

        Args:
            api_name (str): The name of the api to include in the summary.
            is_feature (bool): If True, use the prefix `feat`, otherwise use
                `fix`.
        """

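        # For example, with api_name="drive" and is_feature=True, the method
        # returns "feat(drive): update the api".
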
        # Build the conventional commit string based on the arguments provided
        commit_type = "feat" if is_feature else "fix"
        return "{0}({1}): update the api".format(commit_type, api_name)

    def _get_keys_to_ignore(self):
        """Returns a list of strings with keys to ignore because they rarely
        affect functionality.

        Args: None
        """
        keys_to_ignore = [
            "description",
            "documentation",
            "enum",
            "etag",
            "revision",
            "title",
            "url",
            "rootUrl",
        ]
        return keys_to_ignore

    def _get_stable_versions(self, versions):
        """Returns a pandas series `pd.Series()` of boolean values,
        corresponding to the given series, indicating whether the version is
        considered stable or not.

        Args:
            versions (object): a pandas series containing version
                information for all discovery artifacts.
        """
        # Use a regex on the version to find versions of the form v<digits>,
        # optionally followed by up to two more dot-separated numeric parts.
        # Any api version that matches this pattern will be labeled as stable.
        # In other words, v1, v1.4 and v1.4.5 are stable, but v1b1, v1alpha
        # and v1beta1 are not.
        return versions.str.extract(r"(v\d?\.?\d?\.?\d+$)").notnull()

    def _get_summary_and_write_to_disk(self, dataframe, directory):
        """Writes summary information to file about changes made to discovery
        artifacts based on the provided dataframe and returns a dataframe
        with the same. The file `'allapis.dataframe'` is saved to the
        directory provided.

        Args:
            dataframe (object): a pandas dataframe containing summary change
                information for all discovery artifacts
            directory (str): path where the summary file should be saved
        """

        dataframe["IsStable"] = self._get_stable_versions(dataframe["Version"])

        # Create a filter for features, which contains only rows which have keys
        # that have been deleted or added, that will be used as an argument in
        # the `np.where()` call below.
        filter_features = (dataframe["ChangeType"] == ChangeType.DELETED) | (
            dataframe["ChangeType"] == ChangeType.ADDED
        )

        # Create a new column `IsFeature` to indicate which rows should be
        # considered as features.
        dataframe["IsFeature"] = np.where(filter_features, True, np.nan)

        # Create a new column `IsFeatureAggregate` which will be used to
        # summarize the api changes. We can have either a feature or a fix for
        # a given api, but not both.
        dataframe["IsFeatureAggregate"] = dataframe.groupby("Name").IsFeature.transform(
            lambda x: x.any()
        )

        # Create a new column `Summary`, which will contain a string with the
        # conventional commit message.
        dataframe["Summary"] = np.vectorize(self._build_summary_message)(
            dataframe["Name"], dataframe["IsFeatureAggregate"]
        )
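        # For example, a row with Name "drive" and IsFeatureAggregate True
        # receives the Summary "feat(drive): update the api".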

        # Write the final dataframe to disk as it will be used in the
        # buildprbody.py script
        dataframe.to_csv(directory / "allapis.dataframe")
        return dataframe

    def _write_verbose_changes_to_disk(self, dataframe, directory, summary_df):
        """Writes verbose information to file about changes made to discovery
        artifacts based on the provided dataframe. A separate file is saved
        for each api in the directory provided. The extension of the files
        will be `'.verbose'`.

        Args:
            dataframe (object): a pandas dataframe containing verbose change
                information for all discovery artifacts
            directory (str): path where the verbose files should be saved
            summary_df (object): A dataframe containing a summary of the changes
        """
        # Array of strings which will contain verbose change information for
        # each api
        verbose_changes = []

        # Sort the dataframe to minimize file operations below.
        dataframe.sort_values(
            by=["Name", "Version", "ChangeType"], ascending=True, inplace=True
        )

        # Select only the relevant columns. We need to create verbose output
        # by api name, version and change type, so we group by these columns.
        change_type_groups = dataframe[
            ["Name", "Version", "ChangeType", "Key", "Count"]
        ].groupby(["Name", "Version", "ChangeType"])

        lastApi = ""
        lastVersion = ""
        lastType = ChangeType.UNKNOWN

        f = None
        for name, group in change_type_groups:
            currentApi = name[0]
            currentVersion = name[1]
            currentType = name[2]

            # We need to handle file opening and closing when processing an API
            # which is different from the previous one
            if lastApi != currentApi:
                # If we are processing a new api, close the file used for
                # processing the previous API
                if f is not None:
                    f.writelines(verbose_changes)
                    f.close()
                    f = None
                # Clear the array of strings with information from the previous
                # api and reset the last version
                verbose_changes = []
                lastVersion = ""
                # Create a file which contains verbose changes for the current
                # API being processed
                filename = "{0}.verbose".format(currentApi)
                f = open(pathlib.Path(directory / filename), "a")
                lastApi = currentApi

                # Create a filter with only the rows for the current API
                current_api_filter = summary_df["Name"] == currentApi

                # Get the string in the `Summary` column for the current api and
                # append it to `verbose_changes`. The `Summary` column contains
                # the conventional commit message. Use pandas.Series.iloc[0] to
                # retrieve only the first element, since all the values in the
                # summary column are the same for a given API.
                verbose_changes.append(summary_df[current_api_filter].Summary.iloc[0])

            # If the version has changed, we need to append a new heading
            # in the verbose summary which contains the api and version.
            if lastVersion != currentVersion:
                # Append a header string with the API and version
                verbose_changes.append(
                    "\n\n#### {0}:{1}\n\n".format(currentApi, currentVersion)
                )

                lastVersion = currentVersion
                lastType = ChangeType.UNKNOWN

            # Whenever the change type is different, we need to create a new
            # heading for the group of keys with the same change type.
            if currentType != lastType:
                if currentType == ChangeType.DELETED:
                    verbose_changes.append("\nThe following keys were deleted:\n")
                elif currentType == ChangeType.ADDED:
                    verbose_changes.append("\nThe following keys were added:\n")
                else:
                    verbose_changes.append("\nThe following keys were changed:\n")

                lastType = currentType

            # Append the keys, and the corresponding count, in the same change
            # type group.
            verbose_changes.extend(
                [
                    "- {0} (Total Keys: {1})\n".format(row["Key"], row["Count"])
                    for index, row in group[["Key", "Count"]].iterrows()
                ]
            )
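            # Each entry produced above looks like, for example,
            # "- schemas.Foo.properties (Total Keys: 3)"
            # (the key name and count shown here are illustrative only).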

        # Make sure to close the last file and write the changes.
        if f is not None:
            f.writelines(verbose_changes)
            f.close()
            f = None

    def detect_discovery_changes(self):
        """Writes a summary of the changes to the discovery artifacts to disk
        at the path specified in `temp_dir`.

        Args: None
        """
        result = pd.DataFrame()
        # Process files in parallel to improve performance
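        # Note: MULTIPROCESSING_NUM_PER_BATCH is passed to `pool.map()` below
        # as its `chunksize` argument, so each worker receives files in batches.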
        with Pool(processes=MULTIPROCESSING_NUM_AGENTS) as pool:
            result = result.append(
                pool.map(
                    self._get_discovery_differences,
                    self._file_list,
                    MULTIPROCESSING_NUM_PER_BATCH,
                )
            )

        if len(result):
            # Sort the resulting dataframe by `Name`, `Version`, `ChangeType`
            # and `Key`
            sort_columns = ["Name", "Version", "ChangeType", "Key"]
            result.sort_values(by=sort_columns, ascending=True, inplace=True)

            # Create a folder which will be used by the `createcommits.sh` and
            # `buildprbody.py` scripts.
            pathlib.Path(self._temp_dir).mkdir(exist_ok=True)

            # Create a summary which contains a conventional commit message
            # for each API and write it to disk.
            summary_df = self._get_summary_and_write_to_disk(result, self._temp_dir)

            # Create verbose change information for each API which contains
            # a list of changes by key and write it to disk.
            self._write_verbose_changes_to_disk(result, self._temp_dir, summary_df)