chore: add scripts to update discovery artifacts (#1286)

This PR adds the scripts from #1187 that are needed to update discovery artifacts using a GitHub action. The scripts will be removed from #1187 once all of the review comments on #1187 have been resolved.

This PR adds the following files under the `scripts/` folder
- `README.md` to provide instructions on manually updating discovery artifacts and API reference documentation.
- `buildprbody.py` creates a summary of the changes detected in discovery artifacts and writes them to `allapis.summary`.
- `changesummary.py` creates verbose change information for each API with discovery artifact changes.
- `createcommits.sh` creates git commits for each API with discovery artifact changes or reference document changes.
- `updatediscoveryartifacts.py` is the python file that can be used to update discovery artifacts.

I also moved `describe.py` under the scripts folder and modified it to save the discovery artifacts that are fetched.

TODO:
- [x] Add tests for scripts
- [x] Address review comments in #1187
diff --git a/scripts/changesummary_test.py b/scripts/changesummary_test.py
new file mode 100644
index 0000000..583dbe1
--- /dev/null
+++ b/scripts/changesummary_test.py
@@ -0,0 +1,240 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ChangeSummary tests."""
+
+__author__ = "partheniou@google.com (Anthonios Partheniou)"
+
+import pathlib
+import shutil
+import unittest
+
+import pandas as pd
+
+from changesummary import ChangeSummary
+from changesummary import ChangeType
+from changesummary import DirectoryDoesNotExist
+
+# Directory containing this test module; fixture paths are resolved relative to it.
+SCRIPTS_DIR = pathlib.Path(__file__).parent.resolve()
+# Fixture directory holding the newly fetched discovery artifacts.
+NEW_ARTIFACTS_DIR = SCRIPTS_DIR / "test_resources" / "new_artifacts_dir"
+# Fixture directory holding the previously committed discovery artifacts.
+CURRENT_ARTIFACTS_DIR = SCRIPTS_DIR / "test_resources" / "current_artifacts_dir"
+# Scratch directory; recreated from scratch in setUp() before every test.
+TEMP_DIR = SCRIPTS_DIR / "test_resources" / "temp"
+
+
+class TestChangeSummary(unittest.TestCase):
+    def setUp(self):
+        # Clear temporary directory
+        shutil.rmtree(TEMP_DIR, ignore_errors=True)
+        # Create temporary directory
+        pathlib.Path(TEMP_DIR).mkdir()
+
+        self.cs = ChangeSummary(NEW_ARTIFACTS_DIR, CURRENT_ARTIFACTS_DIR, TEMP_DIR, [])
+
+    def test_raises_on_directory_not_found_new_artifacts_dir(self):
+        with self.assertRaises(DirectoryDoesNotExist):
+            ChangeSummary(
+                "invalid_artifact_dir", CURRENT_ARTIFACTS_DIR, TEMP_DIR, []
+            ).detect_discovery_changes()
+
+    def test_raises_on_directory_not_found_current_artifacts_dir(self):
+        with self.assertRaises(DirectoryDoesNotExist):
+            ChangeSummary(
+                NEW_ARTIFACTS_DIR, "invalid_artifact_dir", TEMP_DIR, []
+            ).detect_discovery_changes()
+
+    def test_raises_on_directory_not_found_temp_dir(self):
+        # Remove temporary directory
+        shutil.rmtree(TEMP_DIR, ignore_errors=True)
+
+        with self.assertRaises(DirectoryDoesNotExist):
+            ChangeSummary(
+                NEW_ARTIFACTS_DIR, CURRENT_ARTIFACTS_DIR, "invalid_temp_dir", []
+            ).detect_discovery_changes()
+
+        # Create temporary directory
+        pathlib.Path(TEMP_DIR).mkdir()
+
+        ChangeSummary(
+            NEW_ARTIFACTS_DIR, CURRENT_ARTIFACTS_DIR, TEMP_DIR, []
+        ).detect_discovery_changes()
+
+    def test_raises_on_directory_not_found(self):
+        with self.assertRaises(DirectoryDoesNotExist):
+            self.cs._raise_if_directory_not_found(directory="invalid_dir")
+
+    def test_load_json_to_dataframe_returns_empty_df_if_file_path_invalid(self):
+        df = self.cs._load_json_to_dataframe(file_path="invalid_path")
+        self.assertTrue(df.empty)
+
+    def test_load_json_to_dataframe_returns_expected_data(self):
+        doc_path = NEW_ARTIFACTS_DIR / "drive.v3.json"
+        df = self.cs._load_json_to_dataframe(file_path=doc_path)
+        self.assertEqual(df["name"].iloc[0], "drive")
+        self.assertEqual(df["version"].iloc[0], "v3")
+
+    def test_get_discovery_differences_for_new_doc_returns_expected_dataframe(self):
+        df = self.cs._get_discovery_differences("drive.v3.json")
+        # Assume that `drive.v3.json` is a new discovery artifact that doesn't
+        # exist in `CURRENT_ARTIFACTS_DIR`.
+        self.assertEqual(df["Name"].iloc[0], "drive")
+        self.assertEqual(df["Version"].iloc[0], "v3")
+
+        # All rows in the dataframe should  have `True` in the `Added` column
+        # and `False` in the `Deleted` column.
+        # pd.Dataframe().all() will return `True` if all elements are `True`.
+        self.assertTrue(df["Added"].all())
+        self.assertTrue((~df["Deleted"]).all())
+
+        # There should be 74 unique key differences
+        self.assertEqual(len(df), 74)
+
+        # Expected Result for key 'schemas.File'
+        # Key            Added   Deleted  Name   Version  ChangeType  Count
+        # schemas.File   True    False    drive      v3           2    168
+        self.assertTrue(df[df["Key"] == "schemas.File"].Added.iloc[0])
+        self.assertFalse(df[df["Key"] == "schemas.File"].Deleted.iloc[0])
+        self.assertEqual(
+            df[df["Key"] == "schemas.File"].ChangeType.iloc[0], ChangeType.ADDED
+        )
+        self.assertEqual(df[df["Key"] == "schemas.File"].Count.iloc[0], 168)
+
+    def test_get_discovery_differences_for_deleted_doc_returns_expected_dataframe(self):
+        df = self.cs._get_discovery_differences("cloudtasks.v2.json")
+        # Assuming that `cloudtasks.v2.json` is a discovery artifact that doesn't
+        # exist in `NEW_ARTIFACTS_DIR`.
+        self.assertEqual(df["Name"].iloc[0], "cloudtasks")
+        self.assertEqual(df["Version"].iloc[0], "v2")
+
+        # All rows in the dataframe should have `False` in the `Added` column
+        # and `True` in the `Deleted` column.
+        # pd.Dataframe().all() will return `True` if all elements are `True`.
+        self.assertTrue((~df["Added"]).all())
+        self.assertTrue(df["Deleted"].all())
+
+        # There should be 72 unique key differences
+        self.assertEqual(len(df), 72)
+
+        # Expected Result for key 'schemas.Task'
+        # Key           Added   Deleted Name        Version  ChangeType  Count
+        # schemas.Task  False     True  cloudtasks      v2           1     18
+        self.assertFalse(df[df["Key"] == "schemas.Task"].Added.iloc[0])
+        self.assertTrue(df[df["Key"] == "schemas.Task"].Deleted.iloc[0])
+        self.assertEqual(
+            df[df["Key"] == "schemas.Task"].ChangeType.iloc[0], ChangeType.DELETED
+        )
+        self.assertEqual(df[df["Key"] == "schemas.Task"].Count.iloc[0], 18)
+
+    def test_get_discovery_differences_for_changed_doc_returns_expected_dataframe(self):
+        # Assuming that `bigquery.v2.json` is a discovery artifact has
+        # changed. There will be a mix of keys being added, changed or deleted.
+        df = self.cs._get_discovery_differences("bigquery.v2.json")
+
+        self.assertEqual(df["Name"].iloc[0], "bigquery")
+        self.assertEqual(df["Version"].iloc[0], "v2")
+
+        # There should be 28 unique key differences
+        # 11 unique keys changed, 13 unique keys added, 4 unique keys deleted
+        self.assertEqual(len(df), 28)
+        self.assertEqual(len(df[df["ChangeType"] == ChangeType.CHANGED]), 11)
+        self.assertEqual(len(df[df["ChangeType"] == ChangeType.ADDED]), 13)
+        self.assertEqual(len(df[df["ChangeType"] == ChangeType.DELETED]), 4)
+
+        # Expected Result for key 'schemas.PrincipalComponentInfo'
+        # Key                             Added  Deleted  Name     Version  ChangeType  Count
+        # schemas.PrincipalComponentInfo  False     True  bigquery v2            1     10
+        key = "schemas.PrincipalComponentInfo"
+        self.assertFalse(df[df["Key"] == key].Added.iloc[0])
+        self.assertTrue(df[df["Key"] == key].Deleted.iloc[0])
+        self.assertEqual(df[df["Key"] == key].ChangeType.iloc[0], ChangeType.DELETED)
+        self.assertEqual(df[df["Key"] == key].Count.iloc[0], 10)
+
+    def test_build_summary_message_returns_expected_result(self):
+        msg = self.cs._build_summary_message(api_name="bigquery", is_feature=True)
+        self.assertEqual(msg, "feat(bigquery): update the api")
+        msg = self.cs._build_summary_message(api_name="bigquery", is_feature=False)
+        self.assertEqual(msg, "fix(bigquery): update the api")
+
+    def test_get_stable_versions(self):
+        # These versions should be considered stable
+        s = pd.Series(["v1", "v1.4", "v1.4.5"])
+        self.assertTrue(self.cs._get_stable_versions(s).all().iloc[0])
+
+        # These versions should not be considered stable
+        s = pd.Series(["v1b1", "v1alpha", "v1beta1"])
+        self.assertTrue((~self.cs._get_stable_versions(s)).all().iloc[0])
+
+    def test_detect_discovery_changes(self):
+        files_changed = ["bigquery.v2.json", "cloudtasks.v2.json", "drive.v3.json"]
+        cs = ChangeSummary(
+            NEW_ARTIFACTS_DIR, CURRENT_ARTIFACTS_DIR, TEMP_DIR, files_changed
+        )
+        cs.detect_discovery_changes()
+        print("test")
+        result = pd.read_csv(TEMP_DIR / "allapis.dataframe")
+
+        # bigquery was added
+        # 28 key changes in total.
+        # 11 unique keys changed, 13 unique keys added, 4 unique keys deleted
+        self.assertEqual(len(result[result["Name"] == "bigquery"]), 28)
+        self.assertEqual(
+            len(result[(result["Name"] == "bigquery") & result["Added"]]), 13
+        )
+        self.assertEqual(
+            len(result[(result["Name"] == "bigquery") & result["Deleted"]]), 4
+        )
+        self.assertTrue(result[result["Name"] == "bigquery"].IsStable.all())
+        self.assertTrue(result[result["Name"] == "bigquery"].IsFeatureAggregate.all())
+        self.assertEqual(
+            result[result["Name"] == "bigquery"].Summary.iloc[0],
+            "feat(bigquery): update the api",
+        )
+
+        # cloudtasks was deleted
+        # 72 key changes in total. All 72 key changes should be deletions.
+        self.assertEqual(len(result[result["Name"] == "cloudtasks"]), 72)
+        self.assertEqual(
+            len(result[(result["Name"] == "cloudtasks") & result["Added"]]), 0
+        )
+        self.assertEqual(
+            len(result[(result["Name"] == "cloudtasks") & result["Deleted"]]), 72
+        )
+        self.assertTrue(result[(result["Name"] == "cloudtasks")].IsStable.all())
+        self.assertTrue(
+            result[(result["Name"] == "cloudtasks")].IsFeatureAggregate.all()
+        )
+        self.assertEqual(
+            result[(result["Name"] == "cloudtasks")].Summary.iloc[0],
+            "feat(cloudtasks): update the api",
+        )
+
+        # drive was updated
+        # 74 key changes in total. All 74 key changes should be additions
+        self.assertEqual(len(result[result["Name"] == "drive"]), 74)
+        self.assertEqual(
+            len(result[(result["Name"] == "drive") & result["Added"]]), 74
+        )
+        self.assertEqual(
+            len(result[(result["Name"] == "drive") & result["Deleted"]]), 0
+        )
+        self.assertTrue(result[(result["Name"] == "drive")].IsStable.all())
+        self.assertTrue(
+            result[(result["Name"] == "drive")].IsFeatureAggregate.all()
+        )
+        self.assertEqual(
+            result[(result["Name"] == "drive")].Summary.iloc[0],
+            "feat(drive): update the api",
+        )
+
+if __name__ == "__main__":
+    unittest.main()