pw_tokenizer: Support database text replacements

Support a --replace argument for database.py that replaces text in
the tokenized string database. This can be used to scrub potentially
sensitive terms (e.g. code names) from token databases, which may be
distributed.

Change-Id: Ie5530e09e9ea1c92894914426fddd1278ce93661
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/19020
Reviewed-by: Keir Mierle <keir@google.com>
Commit-Queue: Wyatt Hepler <hepler@google.com>
diff --git a/pw_tokenizer/py/database_test.py b/pw_tokenizer/py/database_test.py
index cba994f..d79b765 100755
--- a/pw_tokenizer/py/database_test.py
+++ b/pw_tokenizer/py/database_test.py
@@ -193,6 +193,14 @@
         self.assertIn(REPORT_DEFAULT_DOMAIN, mock_stdout.buffer.getvalue())
         self.assertIn(REPORT_TEST_DOMAIN, mock_stdout.buffer.getvalue())
 
+    def test_replace(self):
+        sub = 'replace/ment'
+        run_cli('create', '--database', self._csv, ELF, '--replace',
+                r'(?i)\b[jh]ello\b/' + sub)
+        self.assertEqual(
+            CSV_DEFAULT_DOMAIN.replace('Jello', sub).replace('Hello', sub),
+            self._csv.read_text())
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index f5ccf9c..3a41f34 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py
@@ -27,7 +27,7 @@
 import re
 import struct
 import sys
-from typing import Callable, Dict, Iterable, List, Set
+from typing import Callable, Dict, Iterable, List, Pattern, Set, Tuple
 
 try:
     from pw_tokenizer import elf_reader, tokens
@@ -143,7 +143,8 @@
     }
 
 
-def _handle_create(databases, database, force, output_type, include, exclude):
+def _handle_create(databases, database, force, output_type, include, exclude,
+                   replace):
     """Creates a token database file from one or more ELF files."""
 
     if database == '-':
@@ -156,7 +157,7 @@
         fd = open(database, 'wb')
 
     database = tokens.Database.merged(*databases)
-    database.filter(include, exclude)
+    database.filter(include, exclude, replace)
 
     with fd:
         if output_type == 'csv':
@@ -366,18 +367,47 @@
         '-i',
         '--include',
         type=re.compile,
+        default=[],
         action='append',
-        help=(
-            'If provided, at least one of these regular expressions must match '
-            'for a string to be included in the database.'))
+        help=('If provided, at least one of these regular expressions must '
+              'match for a string to be included in the database.'))
     subparser.add_argument(
         '-e',
         '--exclude',
         type=re.compile,
+        default=[],
         action='append',
         help=('If provided, none of these regular expressions may match for a '
               'string to be included in the database.'))
 
+    unescaped_slash = re.compile(r'(?<!\\)/')
+
+    def replacement(value: str) -> Tuple[Pattern, 'str']:
+        try:
+            find, sub = unescaped_slash.split(value, 1)
+        except ValueError as err:
+            raise argparse.ArgumentTypeError(
+                'replacements must be specified as "search_regex/replacement"')
+
+        try:
+            return re.compile(find.replace(r'\/', '/')), sub
+        except re.error as err:
+            raise argparse.ArgumentTypeError(
+                f'"{value}" is not a valid regular expression: {err}')
+
+    subparser.add_argument(
+        '--replace',
+        type=replacement,
+        default=[],
+        action='append',
+        help=('If provided, replaces text that matches a regular expression. '
+              'This can be used to replace sensitive terms in a token '
+              'database that will be distributed publicly. The expression and '
+              'replacement are specified as "search_regex/replacement". '
+              'Plain slash characters in the regex must be escaped with a '
+              r'backslash (\/). The replacement text may include '
+              'backreferences for captured groups in the regex.'))
+
     # The 'add' command adds strings to a database from a set of ELFs.
     subparser = subparsers.add_parser(
         'add',
diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index 685d074..15d5c13 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py
@@ -22,7 +22,7 @@
 import re
 import struct
 from typing import BinaryIO, Callable, Dict, Iterable, List, NamedTuple
-from typing import Optional, Tuple, Union, ValuesView
+from typing import Optional, Pattern, Tuple, Union, ValuesView
 
 DATE_FORMAT = '%Y-%m-%d'
 
@@ -242,12 +242,18 @@
                 else:
                     self._database[key] = entry
 
-    def filter(self, include: Iterable = (), exclude: Iterable = ()) -> None:
+    def filter(
+        self,
+        include: Iterable[Union[str, Pattern[str]]] = (),
+        exclude: Iterable[Union[str, Pattern[str]]] = (),
+        replace: Iterable[Tuple[Union[str, Pattern[str]], str]] = ()
+    ) -> None:
         """Filters the database using regular expressions (strings or compiled).
 
     Args:
       include: iterable of regexes; only entries matching at least one are kept
       exclude: iterable of regexes; entries matching any of these are removed
+      replace: iterable of (regex, str); replaces matching terms in all entries
     """
         self._cache = None
 
@@ -261,13 +267,18 @@
 
         if exclude:
             exclude_re = [re.compile(pattern) for pattern in exclude]
-            to_delete.extend(key for key, val in self._database.items()  #
-                             if any(
-                                 rgx.search(val.string) for rgx in exclude_re))
+            to_delete.extend(key for key, val in self._database.items() if any(
+                rgx.search(val.string) for rgx in exclude_re))
 
         for key in to_delete:
             del self._database[key]
 
+        for search, replacement in replace:
+            search = re.compile(search)
+
+            for value in self._database.values():
+                value.string = search.sub(replacement, value.string)
+
     def __len__(self) -> int:
         """Returns the number of entries in the database."""
         return len(self.entries())