pw_tokenizer: Improve database parsing error messages - Improve the error messages from failing to parse a token database. - Add tests for the DatabaseFile class. Change-Id: Ic39c435bebd906d56572d3501309cb32fa8bd128 Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/23001 Reviewed-by: Anthony DiGirolamo <tonymd@google.com> Commit-Queue: Wyatt Hepler <hepler@google.com>

commit: 0b485fbaae91a601d3c743204769c959e03f6ae1 [log] [tgz]
author: Wyatt Hepler <hepler@google.com> Thu Oct 29 17:51:22 2020 -0700
committer: CQ Bot Account <pigweed-scoped@luci-project-accounts.iam.gserviceaccount.com> Tue Nov 03 01:41:41 2020 +0000
tree: fb1f75d48b8a00465b99713534da2905e3e5e516
parent: f1f4110493cfd99ce0ebbb4d54bb824b65948fd3 [diff]
diff --git a/pw_tokenizer/py/pw_tokenizer/database.py b/pw_tokenizer/py/pw_tokenizer/database.py
index 3bc3310..2c879ae 100755
--- a/pw_tokenizer/py/pw_tokenizer/database.py
+++ b/pw_tokenizer/py/pw_tokenizer/database.py

@@ -373,13 +373,18 @@
                     paths.update(expand_paths_or_globs(value))
 
             for path in paths:
-                try:
-                    databases.append(load_token_database(path))
-                except:
-                    _LOG.exception('Failed to load token database %s', path)
-                    raise
-        except (FileNotFoundError, ValueError) as err:
+                databases.append(load_token_database(path))
+        except tokens.DatabaseFormatError as err:
+            parser.error(
+                f'argument elf_or_token_database: {path} is not a supported '
+                'token database file. Only ELF files or token databases (CSV '
+                f'or binary format) are supported. {err}. ')
+        except FileNotFoundError as err:
             parser.error(f'argument elf_or_token_database: {err}')
+        except:  # pylint: disable=bare-except
+            _LOG.exception('Failed to load token database %s', path)
+            parser.error('argument elf_or_token_database: '
+                         f'Error occurred while loading token database {path}')
 
         setattr(namespace, self.dest, databases)
 

diff --git a/pw_tokenizer/py/pw_tokenizer/tokens.py b/pw_tokenizer/py/pw_tokenizer/tokens.py
index a0c9b66..570b021 100644
--- a/pw_tokenizer/py/pw_tokenizer/tokens.py
+++ b/pw_tokenizer/py/pw_tokenizer/tokens.py

@@ -345,6 +345,10 @@
 BINARY_FORMAT = _BinaryFileFormat()
 
 
+class DatabaseFormatError(Exception):
+    """Failed to parse a token database file."""
+
+
 def file_is_binary_database(fd: BinaryIO) -> bool:
     """True if the file starts with the binary token database magic string."""
     try:
@@ -356,15 +360,37 @@
         return False
 
 
+def _check_that_file_is_csv_database(path: Path) -> None:
+    """Raises an error unless the path appears to be a CSV token database."""
+    try:
+        with path.open('rb') as fd:
+            data = fd.read(8)  # Read 8 bytes, which should be the first token.
+
+        if not data:
+            return  # File is empty, which is valid CSV.
+
+        if len(data) != 8:
+            raise DatabaseFormatError(
+                f'Attempted to read {path} as a CSV token database, but the '
+                f'file is too short ({len(data)} B)')
+
+        # Make sure the first 8 chars are a valid hexadecimal number.
+        _ = int(data.decode(), 16)
+    except (IOError, UnicodeDecodeError, ValueError) as err:
+        raise DatabaseFormatError(
+            f'Encountered error while reading {path} as a CSV token database'
+        ) from err
+
+
 def parse_binary(fd: BinaryIO) -> Iterable[TokenizedStringEntry]:
     """Parses TokenizedStringEntries from a binary token database file."""
     magic, entry_count = BINARY_FORMAT.header.unpack(
         fd.read(BINARY_FORMAT.header.size))
 
     if magic != BINARY_FORMAT.magic:
-        raise ValueError(
-            'Magic number mismatch (found {!r}, expected {!r})'.format(
-                magic, BINARY_FORMAT.magic))
+        raise DatabaseFormatError(
+            f'Binary token database magic number mismatch (found {magic!r}, '
+            f'expected {BINARY_FORMAT.magic!r}) while reading from {fd}')
 
     entries = []
 
@@ -441,6 +467,7 @@
                 return
 
         # Read the path as a CSV file.
+        _check_that_file_is_csv_database(self.path)
         with self.path.open('r', newline='') as file:
             super().__init__(parse_csv(file))
             self._export = write_csv

diff --git a/pw_tokenizer/py/tokens_test.py b/pw_tokenizer/py/tokens_test.py
index 0154ca7..58014e2 100755
--- a/pw_tokenizer/py/tokens_test.py
+++ b/pw_tokenizer/py/tokens_test.py

@@ -17,6 +17,8 @@
 import datetime
 import io
 import logging
+from pathlib import Path
+import tempfile
 import unittest
 
 from pw_tokenizer import tokens
@@ -87,7 +89,7 @@
 """
 
 
-def read_db_from_csv(csv_str):
+def read_db_from_csv(csv_str: str) -> tokens.Database:
     with io.StringIO(csv_str) as csv_db:
         return tokens.Database(tokens.parse_csv(csv_db))
 
@@ -385,6 +387,47 @@
         self.assertEqual(str(db), CSV_DATABASE)
 
 
+class TestDatabaseFile(unittest.TestCase):
+    """Tests the DatabaseFile class."""
+    def setUp(self):
+        file = tempfile.NamedTemporaryFile(delete=False)
+        file.close()
+        self._path = Path(file.name)
+
+    def tearDown(self):
+        self._path.unlink()
+
+    def test_update_csv_file(self):
+        self._path.write_text(CSV_DATABASE)
+        db = tokens.DatabaseFile(self._path)
+        self.assertEqual(str(db), CSV_DATABASE)
+
+        db.add([tokens.TokenizedStringEntry(0xffffffff, 'New entry!')])
+
+        db.write_to_file()
+
+        self.assertEqual(self._path.read_text(),
+                         CSV_DATABASE + 'ffffffff,          ,"New entry!"\n')
+
+    def test_csv_file_too_short_raises_exception(self):
+        self._path.write_text('1234')
+
+        with self.assertRaises(tokens.DatabaseFormatError):
+            tokens.DatabaseFile(self._path)
+
+    def test_csv_invalid_format_raises_exception(self):
+        self._path.write_text('MK34567890')
+
+        with self.assertRaises(tokens.DatabaseFormatError):
+            tokens.DatabaseFile(self._path)
+
+    def test_csv_not_utf8(self):
+        self._path.write_bytes(b'\x80' * 20)
+
+        with self.assertRaises(tokens.DatabaseFormatError):
+            tokens.DatabaseFile(self._path)
+
+
 class TestFilter(unittest.TestCase):
     """Tests the filtering functionality."""
     def setUp(self):
commit	0b485fbaae91a601d3c743204769c959e03f6ae1	[log] [tgz]
author	Wyatt Hepler <hepler@google.com>	Thu Oct 29 17:51:22 2020 -0700
committer	CQ Bot Account <pigweed-scoped@luci-project-accounts.iam.gserviceaccount.com>	Tue Nov 03 01:41:41 2020 +0000
tree	fb1f75d48b8a00465b99713534da2905e3e5e516
parent	f1f4110493cfd99ce0ebbb4d54bb824b65948fd3 [diff]