pw_tokenizer: Add token_database_fuzzer This CL adds a fuzz target for the TokenDatabase API. A database is generated from the fuzz data, and a random entry count is set. We then look up a random token and iterate over the database to make sure there are no crashes. Change-Id: If9ab79e042dadb9fdf371ddb11fed1932184d7bb

commit: 2ee244b58ea9ebd60a529f7db882d2b594600fa4 [log] [tgz]
author: karthik bharadwaj <karthikmb@google.com> Thu Apr 16 14:08:22 2020 -0700
committer: CQ Bot Account <commit-bot@chromium.org> Sat Apr 25 01:10:45 2020 +0000
tree: bc875b6e6aac77701b5e46a434df04fa0f5d9961
parent: d2abace80c3d22af3461807cecf80e8652c22919 [diff]
diff --git a/pw_tokenizer/BUILD.gn b/pw_tokenizer/BUILD.gn
index ba21bb4..b4663ff 100644
--- a/pw_tokenizer/BUILD.gn
+++ b/pw_tokenizer/BUILD.gn

@@ -117,6 +117,7 @@
     ":simple_tokenize_test_cpp11",
     ":simple_tokenize_test_cpp14",
     ":simple_tokenize_test_cpp17",
+    ":token_database_fuzzer",
     ":token_database_test",
     ":tokenize_test",
   ]
@@ -226,6 +227,16 @@
   ]
 }
 
+# Fuzz target for the TokenDatabase API; built from token_database_fuzzer.cc.
+pw_fuzzer("token_database_fuzzer") {
+  sources = [ "token_database_fuzzer.cc" ]
+  deps = [
+    ":decoder",
+    "$dir_pw_fuzzer",
+    "$dir_pw_preprocessor",
+    "$dir_pw_span",
+  ]
+}
+
 pw_fuzzer("detokenize_fuzzer") {
   sources = [ "detokenize_fuzzer.cc" ]
   deps = [

diff --git a/pw_tokenizer/token_database_fuzzer.cc b/pw_tokenizer/token_database_fuzzer.cc
new file mode 100644
index 0000000..2013387
--- /dev/null
+++ b/pw_tokenizer/token_database_fuzzer.cc

@@ -0,0 +1,131 @@
+// Copyright 2020 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+// This file implements a basic fuzz test for the TokenDatabase class.
+// A database is created from fuzz data, and a random entry count (also
+// derived from the fuzz data) is set. We then look up a token in this
+// database and iterate over all of its entries.
+
+#include <cstring>
+
+#include "pw_fuzzer/asan_interface.h"
+#include "pw_fuzzer/fuzzed_data_provider.h"
+#include "pw_preprocessor/util.h"
+#include "pw_span/span.h"
+#include "pw_tokenizer/token_database.h"
+
+namespace pw::tokenizer {
+namespace {
+
+// Selects how the token database header is initialized for one fuzz run.
+// The value is drawn from the fuzz data in LLVMFuzzerTestOneInput.
+enum FuzzTestType : uint8_t {
+  // Copy in the fixed default header ("TOKENS" followed by zero bytes).
+  kValidHeader,
+  // Fill the header with bytes taken from the fuzz data.
+  kRandomHeader,
+  // Alias of the last enumerator; presumably the upper bound that
+  // FuzzedDataProvider::ConsumeEnum relies on -- confirm against its docs.
+  kMaxValue = kRandomHeader,
+};
+
+// Number of bytes at the start of the fuzz buffer that hold the token
+// database header; the fuzz payload is written immediately after them.
+constexpr size_t kTokenHeaderSize = 16;
+
+// The default max length in bytes of fuzzed data provided. Note that
+// this needs to change if the fuzzer executable is run with a
+// '-max_len' argument.
+constexpr size_t kFuzzDataSizeMax = 4096;
+
+// Byte offset and width of the 'EntryCount' field within the token header.
+constexpr size_t kEntryCountOffset = 8;
+constexpr size_t kEntryCountSize = 4;
+
+// Overwrites the entry-count field of the header at the start of `buffer`
+// with `count`. The bytes of `count` are copied exactly as they are stored
+// in memory; `buffer` must hold at least
+// kEntryCountOffset + kEntryCountSize bytes.
+void SetTokenEntryCountInBuffer(uint8_t* buffer, uint32_t count) {
+  uint8_t* const entry_count_field = buffer + kEntryCountOffset;
+  memcpy(entry_count_field, &count, kEntryCountSize);
+}
+
+// Walks every entry in `database`, reading each entry's string pointer and
+// token so that the iteration is actually exercised by the fuzzer.
+void IterateOverDatabase(TokenDatabase* const database) {
+  TokenDatabase& db = *database;
+  for (TokenDatabase::Entry current : db) {
+    // Nothing consumes these values; copying them into volatile locals
+    // keeps the compiler from optimizing the reads away.
+    volatile const char* string_sink = current.string;
+    volatile uint32_t token_sink = current.token;
+    PW_UNUSED(string_sink);
+    PW_UNUSED(token_sink);
+  }
+}
+}  // namespace
+
+// Fuzzer entry point. Builds a token database image in a static buffer from
+// the fuzz input (header, entry count, then payload), creates a
+// TokenDatabase over it, and runs a Find() plus a full iteration on it.
+//
+// data: the fuzz input bytes.
+// size: length of `data`; inputs longer than kFuzzDataSizeMax are ignored.
+//
+// Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  constexpr size_t kBufferSizeMax = kFuzzDataSizeMax + kTokenHeaderSize;
+  // "TOKENS" plus nine explicit NULs plus the literal's terminating NUL:
+  // 16 bytes in total, which matches kTokenHeaderSize.
+  constexpr char kDefaultHeader[] = "TOKENS\0\0\0\0\0\0\0\0\0";
+  // Static, so the same storage is reused on every call.
+  static uint8_t buffer[kBufferSizeMax];
+
+  // Larger inputs would not fit in the buffer after the header; skip them.
+  if (size > kFuzzDataSizeMax) {
+    return 0;
+  }
+
+  FuzzedDataProvider provider(data, size);
+
+  // Initialize the token header with either the fixed default header or
+  // bytes taken from the fuzz data, based on an enum consumed from the
+  // fuzz data.
+  switch (provider.ConsumeEnum<FuzzTestType>()) {
+    case kValidHeader:
+      memcpy(buffer, kDefaultHeader, kTokenHeaderSize);
+      break;
+
+    case kRandomHeader: {
+      std::vector<uint8_t> random_header =
+          provider.ConsumeBytes<uint8_t>(kTokenHeaderSize);
+      // Force the vector to exactly kTokenHeaderSize elements (any added
+      // elements are zero) so the memcpy below always reads a full header.
+      // NOTE(review): presumably ConsumeBytes can return fewer bytes when
+      // the input runs out -- confirm against the FuzzedDataProvider docs.
+      random_header.resize(kTokenHeaderSize);
+      memcpy(buffer, &random_header[0], kTokenHeaderSize);
+      break;
+    }
+  }
+
+  // Consume a 'test token' integer to look up later in the database.
+  uint32_t random_token = provider.ConsumeIntegral<uint32_t>();
+
+  // Consume a 'token count' integer to set as our database entry count.
+  // The requested range is [0, kFuzzDataSizeMax].
+  uint32_t random_token_count =
+      provider.ConsumeIntegralInRange<uint32_t>(0, kFuzzDataSizeMax);
+
+  // Consume the remaining data. Note that the data corresponding to the
+  // string entries in the database are not explicitly null-terminated.
+  size_t data_bytes_consumed = provider.ConsumeData(buffer + kTokenHeaderSize,
+                                                    provider.remaining_bytes());
+
+  // Overwrite the header's entry-count field with the fuzz-derived count,
+  // whichever header variant was written above.
+  SetTokenEntryCountInBuffer(buffer, random_token_count);
+
+  // Poison the unused buffer space for this run of the fuzzer to
+  // prevent the token database creator from reading too far in.
+  size_t data_size = kTokenHeaderSize + data_bytes_consumed;
+  size_t poisoned_length = kBufferSizeMax - data_size;
+  void* poisoned = &buffer[data_size];
+
+  ASAN_POISON_MEMORY_REGION(poisoned, poisoned_length);
+
+  // We create a database from a span of the buffer since the string
+  // entries might not be null terminated, and the creation of a database
+  // from a raw buffer has an explicit null terminated string requirement
+  // specified in the API.
+  span<uint8_t> data_span(buffer, data_size);
+  auto token_database = TokenDatabase::Create<span<uint8_t>>(data_span);
+  // The lookup result is stored in a volatile object and marked unused;
+  // only the call itself is being exercised.
+  volatile auto match = token_database.Find(random_token);
+  PW_UNUSED(match);
+
+  IterateOverDatabase(&token_database);
+
+  // Un-poison for the next iteration, since the static buffer is reused.
+  ASAN_UNPOISON_MEMORY_REGION(poisoned, poisoned_length);
+
+  return 0;
+}
+
+}  // namespace pw::tokenizer
commit	2ee244b58ea9ebd60a529f7db882d2b594600fa4	[log] [tgz]
author	karthik bharadwaj <karthikmb@google.com>	Thu Apr 16 14:08:22 2020 -0700
committer	CQ Bot Account <commit-bot@chromium.org>	Sat Apr 25 01:10:45 2020 +0000
tree	bc875b6e6aac77701b5e46a434df04fa0f5d9961
parent	d2abace80c3d22af3461807cecf80e8652c22919 [diff]