pw_tokenizer: domain and database creation support

Support new arguments to the pw_tokenizer_database template.

- create: Create a new CSV or binary database instead of updating an
  existing file. This could be used, for example, to generate a binary
  version of an in-tree CSV database.
- domain: Extract tokens from the specified tokenization domain.

Change-Id: I0c91176f99ec07bc9f690d2a6c542eca606c88a3
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/15962
Commit-Queue: Wyatt Hepler <hepler@google.com>
Reviewed-by: Keir Mierle <keir@google.com>
diff --git a/pw_tokenizer/database.gni b/pw_tokenizer/database.gni
index 0cad012..c7b46f9 100644
--- a/pw_tokenizer/database.gni
+++ b/pw_tokenizer/database.gni
@@ -25,7 +25,12 @@
 # created as a starting point.
 #
 # Args:
-#   database: source tree path to database file to update; must exist beforehand
+#   database: if updating a database, path to an existing database in the source
+#       tree; optional if creating a database, but may provide a path in the
+#       output directory to override the default of
+#       "$target_gen_dir/$target_name.[csv/binary]"
+#   create: if specified, create a database instead of updating one; 'create'
+#       must be set to one of the supported database types: "csv" or "binary"
 #   targets: GN targets (executables or libraries) from which to add tokens;
 #       these targets are added to deps
 #   optional_targets: GN targets from which to add tokens, if the output files
@@ -33,11 +38,27 @@
 #   input_databases: paths to other database files from which to add tokens
 #   deps: GN targets to build prior to generating the database; artifacts from
 #       these targets are NOT implicitly used for database generation
+#   domain: if provided, extract strings from tokenization domains matching this
+#       regular expression
 #
 template("pw_tokenizer_database") {
-  assert(defined(invoker.database),
-         "pw_tokenizer_database requires a 'database' variable")
+  assert(defined(invoker.database) || defined(invoker.create),
+         "pw_tokenizer_database requires a 'database' variable, unless " +
+             "'create' is specified")
 
+  if (defined(invoker.create)) {
+    assert(invoker.create == "csv" || invoker.create == "binary",
+           "If provided, 'create' must be \"csv\" or \"binary\"")
+    _create = invoker.create
+  } else {
+    _create = ""
+  }
+
+  if (defined(invoker.database)) {
+    _database = invoker.database
+  } else {
+    _database = "$target_gen_dir/$target_name.${invoker.create}"
+  }
   if (defined(invoker.targets)) {
     _targets = invoker.targets
   } else {
@@ -62,23 +83,50 @@
           "pw_tokenizer_database! At least one target or database must be " +
           "provided as an input.")
 
+  if (defined(invoker.domain)) {
+    _domain = "#" + invoker.domain
+  } else {
+    _domain = ""
+  }
+
   pw_python_script(target_name) {
     script = "$dir_pw_tokenizer/py/pw_tokenizer/database.py"
-    args = [
-      "add",
+
+    inputs = _input_databases
+
+    if (_create == "") {
+      args = [ "add" ]
+      inputs += [ _database ]
+
+      # Since the output file is in the source tree, create a corresponding
+      # stamp file in the output directory that is independent of the toolchain.
+      # That way, updating the database from multiple toolchains is an error.
+      stamp =
+          "$root_build_dir/" + rebase_path(invoker.database, "//") + ".update"
+    } else {
+      args = [
+        "create",
+        "--force",
+        "--type",
+        _create,
+      ]
+      outputs = [ _database ]
+    }
+
+    args += [
       "--database",
-      rebase_path(invoker.database),
+      rebase_path(_database),
     ]
     args += rebase_path(_input_databases)
 
     foreach(target, _targets) {
-      args += [ "<TARGET_FILE($target)>" ]
+      args += [ "<TARGET_FILE($target)>$_domain" ]
     }
 
     # For optional targets, the build outputs may not exist, since they aren't
     # added to deps. Use TARGET_FILE_IF_EXISTS to handle this.
     foreach(target, _optional_targets) {
-      args += [ "<TARGET_FILE_IF_EXISTS($target)>" ]
+      args += [ "<TARGET_FILE_IF_EXISTS($target)>$_domain" ]
     }
 
     deps = _targets
@@ -86,12 +134,5 @@
     if (defined(invoker.deps)) {
       deps += invoker.deps
     }
-
-    inputs = [ invoker.database ] + _input_databases
-
-    # Since the output file is in the source tree, create a corresponding stamp
-    # file in the output directory that is independent of the toolchain. That
-    # way, trying to update the database from multiple toolchains is an error.
-    stamp = "$root_build_dir/" + rebase_path(invoker.database, "//") + ".update"
   }
 }
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 48d66a7..72a8040 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -485,10 +485,15 @@
 
 GN integration
 ^^^^^^^^^^^^^^
-Token databases may be updated as part of a GN build. The
+Token databases may be updated or created as part of a GN build. The
 ``pw_tokenizer_database`` template provided by ``dir_pw_tokenizer/database.gni``
-automatically updates a tokenized strings database in the source tree with
-artifacts from one or more GN targets or other database files.
+automatically updates an in-source tokenized strings database or creates a new
+database with artifacts from one or more GN targets or other database files.
+
+To create a new database, set the ``create`` variable to the desired database
+type (``"csv"`` or ``"binary"``). The database will be created in the output
+directory. To update an existing database, provide the path to the database with
+the ``database`` variable.
 
 Each database in the source tree can only be updated from a single
 ``pw_tokenizer_database`` rule. Updating the same database in multiple rules