pw_tokenizer: GN integration for token databases

- Provide the pw_tokenizer_database template that updates a token
  database in the source tree from build artifacts or other token
  databases.
- Support paths or booleans for pw_python_script's stamp argument.

Change-Id: I11a35bb77d6cfa1f328986915e3f17e9603327a5
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/14586
Commit-Queue: Wyatt Hepler <hepler@google.com>
Reviewed-by: Alexei Frolov <frolv@google.com>
diff --git a/pw_build/docs.rst b/pw_build/docs.rst
index 05b31dc..c16eab1 100644
--- a/pw_build/docs.rst
+++ b/pw_build/docs.rst
@@ -97,9 +97,13 @@
 ``pw_python_script`` accepts all of the arguments of a regular ``action``
 target. Additionally, it has some of its own arguments:
 
-* ``stamp``: Optional boolean indicating whether to automatically create a dummy
-  output file for the script. This allows running scripts without specifying any
-  ``outputs``.
+* ``capture_output``: Optional boolean. If true, script output is hidden unless
+  the script fails with an error. Defaults to true.
+* ``stamp``: Optional boolean or file path indicating whether to automatically
+  create a dummy output file for the script. This allows running scripts
+  without specifying ``outputs``. If ``stamp`` is true, a generic output file
+  is used. If ``stamp`` is a file path, that file is used as a stamp file. Like
+  any output file, ``stamp`` must be in the build directory. Defaults to false.
 
 **Expressions**
 
diff --git a/pw_build/py/pw_build/python_runner.py b/pw_build/py/pw_build/python_runner.py
index 164b996..3679041 100755
--- a/pw_build/py/pw_build/python_runner.py
+++ b/pw_build/py/pw_build/python_runner.py
@@ -83,7 +83,7 @@
     def resolve(self, gn_path: str) -> Path:
         """Resolves a GN path to a filesystem path."""
         if gn_path.startswith('//'):
-            return self.root.joinpath(gn_path[2:]).resolve()
+            return self.root.joinpath(gn_path.lstrip('/')).resolve()
 
         return self.cwd.joinpath(gn_path).resolve()
 
diff --git a/pw_build/python_script.gni b/pw_build/python_script.gni
index ef53d19..d9ec728 100644
--- a/pw_build/python_script.gni
+++ b/pw_build/python_script.gni
@@ -65,8 +65,9 @@
 #   capture_output (=true)  If true, script output is hidden unless the script
 #                           fails with an error. Defaults to true.
 #
-#   stamp                   File to touch if the script is successful. If not
-#                           set, no file is touched.
+#   stamp                   File to touch if the script is successful. If set to
+#                           true, a generic file is used. If false or not set,
+#                           no file is touched.
 #
 template("pw_python_script") {
   assert(defined(invoker.script), "pw_python_script requires a script to run")
@@ -102,8 +103,13 @@
   }
 
   # If a stamp file is requested, add it as an output of the runner script.
-  if (defined(invoker.stamp) && invoker.stamp) {
-    _stamp_file = "$target_gen_dir/$target_name.pw_pystamp"
+  if (defined(invoker.stamp) && invoker.stamp != false) {
+    if (invoker.stamp == true) {
+      _stamp_file = "$target_gen_dir/$target_name.pw_pystamp"
+    } else {
+      _stamp_file = invoker.stamp
+    }
+
     _outputs += [ _stamp_file ]
     _script_args += [
       "--touch",
diff --git a/pw_tokenizer/database.gni b/pw_tokenizer/database.gni
new file mode 100644
index 0000000..477a763
--- /dev/null
+++ b/pw_tokenizer/database.gni
@@ -0,0 +1,74 @@
+# Copyright 2020 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+# gn-format disable
+import("//build_overrides/pigweed.gni")
+
+import("$dir_pw_build/python_script.gni")
+
+# Updates a tokenized string database in the source tree with artifacts from one
+# or more targets. Other database files may also be used.
+#
+# The database file must exist. A CSV or binary database can be created with the
+# pw/pw_tokenizer/database.py tool. An empty CSV database file can also be
+# created as a starting point.
+#
+# Args:
+#   database: source tree path to database file to update; must exist beforehand
+#   targets: GN targets (executables or libraries) from which to add tokens
+#   input_databases: paths to other database files from which to add tokens
+#
+template("pw_tokenizer_database") {
+  assert(defined(invoker.database),
+         "pw_tokenizer_database requires a 'database' variable")
+
+  if (defined(invoker.targets)) {
+    _targets = invoker.targets
+  } else {
+    _targets = []
+  }
+
+  if (defined(invoker.input_databases)) {
+    _input_databases = invoker.input_databases
+  } else {
+    _input_databases = []
+  }
+
+  assert(
+      _targets != [] || _input_databases != [],
+      "No 'targets' or 'input_databases' were set for pw_tokenizer_database! " +
+          "At least one target or database must be provided as an input.")
+
+  pw_python_script(target_name) {
+    script = "$dir_pw_tokenizer/py/pw_tokenizer/database.py"
+    args = [
+      "add",
+      "--database",
+      rebase_path(invoker.database),
+    ]
+    args += rebase_path(_input_databases)
+
+    # Iterate _targets (not invoker.targets), which defaults to [] when unset.
+    foreach(target, _targets) {
+      args += [ "<TARGET_FILE($target)>" ]
+    }
+
+    deps = _targets
+    inputs = [ invoker.database ] + _input_databases
+
+    # Since the output file is in the source tree, create a corresponding stamp
+    # file in the output directory that is independent of the toolchain. That
+    # way, trying to update the database from multiple toolchains is an error.
+    stamp = "$root_build_dir/" + rebase_path(invoker.database, "//") + ".update"
+  }
+}
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 2541652..9a5b369 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -483,6 +483,33 @@
 changes are made. The build system can invoke ``database.py`` to update the
 database after each build.
 
+GN integration
+^^^^^^^^^^^^^^
+Token databases may be updated as part of a GN build. The
+``pw_tokenizer_database`` template provided by ``dir_pw_tokenizer/database.gni``
+automatically updates a tokenized strings database in the source tree with
+artifacts from one or more GN targets or other database files.
+
+Each database in the source tree can only be updated from a single
+``pw_tokenizer_database`` rule. Updating the same database in multiple rules
+results in ``Duplicate output file`` GN errors or ``multiple rules generate
+<file>`` Ninja errors. To avoid these errors, ``pw_tokenizer_database`` rules
+should be defined in the default toolchain, and the input targets should be
+referenced with specific toolchains.
+
+.. code-block::
+
+  # gn-format disable
+  import("//build_overrides/pigweed.gni")
+
+  import("$dir_pw_tokenizer/database.gni")
+
+  pw_tokenizer_database("my_database") {
+    database = "database_in_the_source_tree.csv"
+    targets = [ "//firmware/image:foo(//targets/my_board:some_toolchain)" ]
+    input_databases = [ "other_database.csv" ]
+  }
+
 Detokenization
 ==============
 Detokenization is the process of expanding a token to the string it represents
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index 5078baf..2a18d9c 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py
@@ -231,12 +231,16 @@
             # This is a valid path; yield it without evaluating it as a glob.
             yield Path(path_or_glob)
         else:
-            paths = glob.glob(path_or_glob)
-            if not paths:
+            paths = glob.glob(path_or_glob, recursive=True)
+
+            # If no paths were found and the path is not a glob, raise an Error.
+            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                 raise FileNotFoundError(f'{path_or_glob} is not a valid path')
 
             for path in paths:
-                yield Path(path)
+                # Resolve globs to CSV or compatible binary files.
+                if elf_reader.compatible_file(path) or path.endswith('.csv'):
+                    yield Path(path)
 
 
 class ExpandGlobs(argparse.Action):
@@ -291,7 +295,8 @@
               'tokens. For ELF files, the tokenization domain to read from '
               'may specified after the path as #domain_name (e.g. '
               'foo.elf#TEST_DOMAIN). Unless specified, only the default '
-              'domain is read from ELF files; .* reads all domains.'))
+              'domain is read from ELF files; .* reads all domains. Globs are '
+              'expanded to compatible database files.'))
     return parser
 
 
diff --git a/pw_tokenizer/py/pw_tokenizer/elf_reader.py b/pw_tokenizer/py/pw_tokenizer/elf_reader.py
index 6eec96c..2a3ac3b 100755
--- a/pw_tokenizer/py/pw_tokenizer/elf_reader.py
+++ b/pw_tokenizer/py/pw_tokenizer/elf_reader.py
@@ -24,6 +24,7 @@
 """
 
 import argparse
+from pathlib import Path
 import re
 import struct
 import sys
@@ -152,12 +153,19 @@
         return False
 
 
-def compatible_file(fd: BinaryIO) -> bool:
+def compatible_file(file: Union[BinaryIO, str, Path]) -> bool:
     """True if the file type is supported (ELF or archive)."""
-    offset = fd.tell()
-    fd.seek(0)
-    result = _bytes_match(fd, ELF_MAGIC) or _bytes_match(fd, ARCHIVE_MAGIC)
-    fd.seek(offset)
+    try:
+        fd = open(file, 'rb') if isinstance(file, (str, Path)) else file
+
+        offset = fd.tell()
+        fd.seek(0)
+        result = _bytes_match(fd, ELF_MAGIC) or _bytes_match(fd, ARCHIVE_MAGIC)
+        fd.seek(offset)
+    finally:
+        if isinstance(file, (str, Path)):
+            fd.close()
+
     return result