pw_tokenizer: Store tokens along with strings

- Instead of storing plain tokenized strings in the ELF, store tokenized
  string entries with the token, string, and domain.
- Update the C++ tokenization code to remove length limitations. Since
  the tokens are stored with the strings, the token never has to be
  recalculated and there is no need for consistency between C++ and C
  (which can only hash a fixed number of characters).
- Use better unique names (based on __LINE__ and __COUNTER__) for
  tokenized variables and sections.
- Change the default domain from "default" to "". This is a more obvious
  default and takes less space in the ELF.

Change-Id: I74fba3be55c0df67a71ef22143fe4916803aa796
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/21980
Commit-Queue: Wyatt Hepler <hepler@google.com>
Reviewed-by: Keir Mierle <keir@google.com>
diff --git a/pw_tokenizer/tokenize_test.cc b/pw_tokenizer/tokenize_test.cc
index 6a64c1a..01c6bc5 100644
--- a/pw_tokenizer/tokenize_test.cc
+++ b/pw_tokenizer/tokenize_test.cc
@@ -20,29 +20,17 @@
 #include <iterator>
 
 #include "gtest/gtest.h"
-#include "pw_tokenizer/pw_tokenizer_65599_fixed_length_hash.h"
+#include "pw_tokenizer/hash.h"
 #include "pw_tokenizer_private/tokenize_test.h"
 #include "pw_varint/varint.h"
 
 namespace pw::tokenizer {
 namespace {
 
-// The hash to use for this test. This makes sure the strings are shorter than
-// the configured max length to ensure this test works with any reasonable
-// configuration.
-template <size_t kSize>
-constexpr uint32_t TestHash(const char (&string)[kSize]) {
-  constexpr unsigned kTestHashLength = 64;
-  static_assert(kTestHashLength <= PW_TOKENIZER_CFG_HASH_LENGTH);
-  static_assert(kSize <= kTestHashLength + 1);
-  return PwTokenizer65599FixedLengthHash(std::string_view(string, kSize - 1),
-                                         kTestHashLength);
-}
-
 // Constructs an array with the hashed string followed by the provided bytes.
 template <uint8_t... kData, size_t kSize>
 constexpr auto ExpectedData(const char (&format)[kSize]) {
-  const uint32_t value = TestHash(format);
+  const uint32_t value = Hash(format);
   return std::array<uint8_t, sizeof(uint32_t) + sizeof...(kData)>{
       static_cast<uint8_t>(value & 0xff),
       static_cast<uint8_t>(value >> 8 & 0xff),
@@ -58,23 +46,23 @@
 
 TEST(TokenizeString, String_MatchesHash) {
   constexpr uint32_t token = PW_TOKENIZE_STRING("[:-)");
-  EXPECT_EQ(TestHash("[:-)"), token);
+  EXPECT_EQ(Hash("[:-)"), token);
 }
 
 constexpr uint32_t kGlobalToken = PW_TOKENIZE_STRING(">:-[]");
 
 TEST(TokenizeString, GlobalVariable_MatchesHash) {
-  EXPECT_EQ(TestHash(">:-[]"), kGlobalToken);
+  EXPECT_EQ(Hash(">:-[]"), kGlobalToken);
 }
 
 struct TokenizedWithinClass {
   static constexpr uint32_t kThisToken = PW_TOKENIZE_STRING("???");
 };
 
-static_assert(TestHash("???") == TokenizedWithinClass::kThisToken);
+static_assert(Hash("???") == TokenizedWithinClass::kThisToken);
 
 TEST(TokenizeString, ClassMember_MatchesHash) {
-  EXPECT_EQ(TestHash("???"), TokenizedWithinClass().kThisToken);
+  EXPECT_EQ(Hash("???"), TokenizedWithinClass().kThisToken);
 }
 
 // Use a function with a shorter name to test tokenizing __func__ and
@@ -89,11 +77,11 @@
 //
 void TestName() {
   constexpr uint32_t function_hash = PW_TOKENIZE_STRING(__func__);
-  EXPECT_EQ(pw::tokenizer::TestHash(__func__), function_hash);
+  EXPECT_EQ(pw::tokenizer::Hash(__func__), function_hash);
 
   // Check the non-standard __PRETTY_FUNCTION__ name.
   constexpr uint32_t pretty_function = PW_TOKENIZE_STRING(__PRETTY_FUNCTION__);
-  EXPECT_EQ(pw::tokenizer::TestHash(__PRETTY_FUNCTION__), pretty_function);
+  EXPECT_EQ(pw::tokenizer::Hash(__PRETTY_FUNCTION__), pretty_function);
 }
 
 TEST(TokenizeString, FunctionName) { TestName(); }
@@ -102,17 +90,32 @@
   constexpr char array[] = "won-won-won-wonderful";
 
   const uint32_t array_hash = PW_TOKENIZE_STRING(array);
-  EXPECT_EQ(TestHash(array), array_hash);
+  EXPECT_EQ(Hash(array), array_hash);
+}
+
+TEST(TokenizeString, NullInString) {
+  // Use PW_TOKENIZER_STRING_TOKEN to avoid emitting strings with NUL into the
+  // ELF file. The CSV database format does not support NUL.
+  constexpr char nulls[32] = {};
+  static_assert(Hash(nulls) == PW_TOKENIZER_STRING_TOKEN(nulls));
+  static_assert(PW_TOKENIZER_STRING_TOKEN(nulls) != 0u);
+
+  static_assert(PW_TOKENIZER_STRING_TOKEN("\0") == Hash("\0"));
+  static_assert(PW_TOKENIZER_STRING_TOKEN("\0") != Hash(""));
+
+  static_assert(PW_TOKENIZER_STRING_TOKEN("abc\0def") == Hash("abc\0def"));
+
+  static_assert(Hash("abc\0def") != Hash("abc\0def\0"));
 }
 
 // Verify that we can tokenize multiple strings from one source line.
-#define THREE_FOR_ONE(first, second, third)         \
-  [[maybe_unused]] constexpr uint32_t token_1 =     \
-      PW_TOKENIZE_STRING_DOMAIN("ignored", first);  \
-  [[maybe_unused]] constexpr uint32_t token_2 =     \
-      PW_TOKENIZE_STRING_DOMAIN("ignored", second); \
-  [[maybe_unused]] constexpr uint32_t token_3 =     \
-      PW_TOKENIZE_STRING_DOMAIN("ignored", third);
+#define THREE_FOR_ONE(first, second, third)             \
+  [[maybe_unused]] constexpr uint32_t token_1 =         \
+      PW_TOKENIZE_STRING_DOMAIN("TEST_DOMAIN", first);  \
+  [[maybe_unused]] constexpr uint32_t token_2 =         \
+      PW_TOKENIZE_STRING_DOMAIN("TEST_DOMAIN", second); \
+  [[maybe_unused]] constexpr uint32_t token_3 =         \
+      PW_TOKENIZE_STRING_DOMAIN("TEST_DOMAIN", third);
 
 TEST(TokenizeString, MultipleTokenizationsInOneMacroExpansion) {
   // This verifies that we can safely tokenize multiple times in a single macro
@@ -479,11 +482,10 @@
   EXPECT_EQ(std::memcmp(expected.data(), message_, expected.size()), 0);
 }
 
-// Hijack the PW_TOKENIZE_STRING_DOMAIN macro to capture the domain name.
-#undef PW_TOKENIZE_STRING_DOMAIN
-#define PW_TOKENIZE_STRING_DOMAIN(domain, string)                 \
-  /* assigned to a variable */ PW_TOKENIZER_STRING_TOKEN(string); \
-  tokenizer_domain = domain;                                      \
+// Hijack an internal macro to capture the tokenizer domain.
+#undef _PW_TOKENIZER_RECORD_ORIGINAL_STRING
+#define _PW_TOKENIZER_RECORD_ORIGINAL_STRING(token, domain, string) \
+  tokenizer_domain = domain;                                        \
   string_literal = string
 
 TEST_F(TokenizeToBuffer, Domain_Default) {