pw_tokenizer: Support database text replacements Support a --replace argument for database.py that replaces text in the tokenized string database. This can be used to scrub potentially sensitive terms (e.g. code names) from token databases, which may be distributed. Change-Id: Ie5530e09e9ea1c92894914426fddd1278ce93661 Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/19020 Reviewed-by: Keir Mierle <keir@google.com> Commit-Queue: Wyatt Hepler <hepler@google.com>

commit: 7d9cc974fbd14557e171e15e81201757334bf99a [log] [tgz]
author: Wyatt Hepler <hepler@google.com> Wed Apr 08 16:55:47 2020 -0700
committer: CQ Bot Account <commit-bot@chromium.org> Thu Oct 01 16:20:04 2020 +0000
tree: c3caf7410f7a1ac873bb602274a5b197900c5139
parent: 140c6ef4132208d8d115e9835645bbc52e33e111 [diff]
diff --git a/pw_tokenizer/py/database_test.py b/pw_tokenizer/py/database_test.py
index cba994f..d79b765 100755
--- a/pw_tokenizer/py/database_test.py
+++ b/pw_tokenizer/py/database_test.py

@@ -193,6 +193,14 @@
         self.assertIn(REPORT_DEFAULT_DOMAIN, mock_stdout.buffer.getvalue())
         self.assertIn(REPORT_TEST_DOMAIN, mock_stdout.buffer.getvalue())
 
+    def test_replace(self):
+        sub = 'replace/ment'
+        run_cli('create', '--database', self._csv, ELF, '--replace',
+                r'(?i)\b[jh]ello\b/' + sub)
+        self.assertEqual(
+            CSV_DEFAULT_DOMAIN.replace('Jello', sub).replace('Hello', sub),
+            self._csv.read_text())
+
 
 if __name__ == '__main__':
     unittest.main()

diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index f5ccf9c..3a41f34 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py

@@ -27,7 +27,7 @@
 import re
 import struct
 import sys
-from typing import Callable, Dict, Iterable, List, Set
+from typing import Callable, Dict, Iterable, List, Pattern, Set, Tuple
 
 try:
     from pw_tokenizer import elf_reader, tokens
@@ -143,7 +143,8 @@
     }
 
 
-def _handle_create(databases, database, force, output_type, include, exclude):
+def _handle_create(databases, database, force, output_type, include, exclude,
+                   replace):
     """Creates a token database file from one or more ELF files."""
 
     if database == '-':
@@ -156,7 +157,7 @@
         fd = open(database, 'wb')
 
     database = tokens.Database.merged(*databases)
-    database.filter(include, exclude)
+    database.filter(include, exclude, replace)
 
     with fd:
         if output_type == 'csv':
@@ -366,18 +367,47 @@
         '-i',
         '--include',
         type=re.compile,
+        default=[],
         action='append',
-        help=(
-            'If provided, at least one of these regular expressions must match '
-            'for a string to be included in the database.'))
+        help=('If provided, at least one of these regular expressions must '
+              'match for a string to be included in the database.'))
     subparser.add_argument(
         '-e',
         '--exclude',
         type=re.compile,
+        default=[],
         action='append',
         help=('If provided, none of these regular expressions may match for a '
               'string to be included in the database.'))
 
+    unescaped_slash = re.compile(r'(?<!\\)/')
+
+    def replacement(value: str) -> Tuple[Pattern, 'str']:
+        try:
+            find, sub = unescaped_slash.split(value, 1)
+        except ValueError as err:
+            raise argparse.ArgumentTypeError(
+                'replacements must be specified as "search_regex/replacement"')
+
+        try:
+            return re.compile(find.replace(r'\/', '/')), sub
+        except re.error as err:
+            raise argparse.ArgumentTypeError(
+                f'"{value}" is not a valid regular expression: {err}')
+
+    subparser.add_argument(
+        '--replace',
+        type=replacement,
+        default=[],
+        action='append',
+        help=('If provided, replaces text that matches a regular expression. '
+              'This can be used to replace sensitive terms in a token '
+              'database that will be distributed publicly. The expression and '
+              'replacement are specified as "search_regex/replacement". '
+              'Plain slash characters in the regex must be escaped with a '
+              r'backslash (\/). The replacement text may include '
+              'backreferences for captured groups in the regex.'))
+
     # The 'add' command adds strings to a database from a set of ELFs.
     subparser = subparsers.add_parser(
         'add',

diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index 685d074..15d5c13 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py

@@ -22,7 +22,7 @@
 import re
 import struct
 from typing import BinaryIO, Callable, Dict, Iterable, List, NamedTuple
-from typing import Optional, Tuple, Union, ValuesView
+from typing import Optional, Pattern, Tuple, Union, ValuesView
 
 DATE_FORMAT = '%Y-%m-%d'
 
@@ -242,12 +242,18 @@
                 else:
                     self._database[key] = entry
 
-    def filter(self, include: Iterable = (), exclude: Iterable = ()) -> None:
+    def filter(
+        self,
+        include: Iterable[Union[str, Pattern[str]]] = (),
+        exclude: Iterable[Union[str, Pattern[str]]] = (),
+        replace: Iterable[Tuple[Union[str, Pattern[str]], str]] = ()
+    ) -> None:
         """Filters the database using regular expressions (strings or compiled).
 
     Args:
       include: iterable of regexes; only entries matching at least one are kept
       exclude: iterable of regexes; entries matching any of these are removed
+      replace: iterable of (regex, str); replaces matching terms in all entries
     """
         self._cache = None
 
@@ -261,13 +267,18 @@
 
         if exclude:
             exclude_re = [re.compile(pattern) for pattern in exclude]
-            to_delete.extend(key for key, val in self._database.items()  #
-                             if any(
-                                 rgx.search(val.string) for rgx in exclude_re))
+            to_delete.extend(key for key, val in self._database.items() if any(
+                rgx.search(val.string) for rgx in exclude_re))
 
         for key in to_delete:
             del self._database[key]
 
+        for search, replacement in replace:
+            search = re.compile(search)
+
+            for value in self._database.values():
+                value.string = search.sub(replacement, value.string)
+
     def __len__(self) -> int:
         """Returns the number of entries in the database."""
         return len(self.entries())
commit	7d9cc974fbd14557e171e15e81201757334bf99a	[log] [tgz]
author	Wyatt Hepler <hepler@google.com>	Wed Apr 08 16:55:47 2020 -0700
committer	CQ Bot Account <commit-bot@chromium.org>	Thu Oct 01 16:20:04 2020 +0000
tree	c3caf7410f7a1ac873bb602274a5b197900c5139
parent	140c6ef4132208d8d115e9835645bbc52e33e111 [diff]