perf: Optimize unordered_map usage in thread_safety.h

Wrap std::unordered_map in a 'small_unordered_map' class that has inline storage
for the first element of the map. This avoids memory allocation and hashing in the
common case where the map holds only zero or one entries at a time, which happens
often because entries only stay in the map for the duration of an entry
point.
diff --git a/scripts/thread_safety_generator.py b/scripts/thread_safety_generator.py
index 4cb3288..c38d3d6 100644
--- a/scripts/thread_safety_generator.py
+++ b/scripts/thread_safety_generator.py
@@ -191,13 +191,63 @@
     int writer_count;
 };
 
+// Wrapper around std::unordered_map optimized for holding at most one element.
+// A single key/value pair is stored inline in the class, so the common case
+// needs no hashing or memory (de)allocation; other keys go to the wrapped map.
+// NOTE: operator[] does not reset the inline value when the slot is reused.
+// TODO: Consider generalizing this from one element to N (a template parameter).
+template <typename Key, typename T>
+class small_unordered_map {
+    // Inline slot: key and value are meaningful only while first_data_allocated.
+    bool first_data_allocated;
+    Key first_data_key;
+    T first_data;
+    // Overflow storage for entries that are not held in the inline slot.
+    std::unordered_map<Key, T> uses;
+
+public:
+    small_unordered_map() : first_data_allocated(false) {}  // inline slot unused
+    // Returns true if object is held in the inline slot or the overflow map.
+    bool contains(const Key& object) const {
+        if (first_data_allocated && object == first_data_key) {
+            return true;
+        // Check size() first so an empty overflow map never hashes object.
+        } else if (uses.size() == 0) {
+            return false;
+        } else {
+            return uses.find(object) != uses.end();
+        }
+    }
+    // Returns a reference to object's value, adding an entry if it is absent.
+    T& operator[](const Key& object) {
+        if (first_data_allocated && first_data_key == object) {
+            return first_data;
+        } else if (!first_data_allocated && uses.size() == 0) {  // fully empty
+            first_data_allocated = true;  // claim the inline slot for object
+            first_data_key = object;
+            return first_data;  // not reset here: may hold a previous value
+        } else {
+            return uses[object];
+        }
+    }
+    // Removes object if present; returns the number of entries removed.
+    typename std::unordered_map<Key, T>::size_type erase(const Key& object) {
+        if (first_data_allocated && first_data_key == object) {
+            first_data_allocated = false;  // key/value are left as they were
+            return 1;
+        } else {
+            return uses.erase(object);
+        }
+    }
+};
+
 template <typename T>
 class counter {
 public:
     const char *typeName;
     VkDebugReportObjectTypeEXT objectType;
     debug_report_data **report_data;
-    std::unordered_map<T, object_use_data> uses;
+    small_unordered_map<T, object_use_data> uses;
     std::mutex counter_lock;
     std::condition_variable counter_condition;
 
@@ -209,7 +259,7 @@
         bool skip = false;
         loader_platform_thread_id tid = loader_platform_get_thread_id();
         std::unique_lock<std::mutex> lock(counter_lock);
-        if (uses.find(object) == uses.end()) {
+        if (!uses.contains(object)) {
             // There is no current use of the object.  Record writer thread.
             struct object_use_data *use_data = &uses[object];
             use_data->reader_count = 0;
@@ -227,7 +277,7 @@
                         typeName, (uint64_t)use_data->thread, (uint64_t)tid);
                     if (skip) {
                         // Wait for thread-safe access to object instead of skipping call.
-                        while (uses.find(object) != uses.end()) {
+                        while (uses.contains(object)) {
                             counter_condition.wait(lock);
                         }
                         // There is now no current use of the object.  Record writer thread.
@@ -255,7 +305,7 @@
                         typeName, (uint64_t)use_data->thread, (uint64_t)tid);
                     if (skip) {
                         // Wait for thread-safe access to object instead of skipping call.
-                        while (uses.find(object) != uses.end()) {
+                        while (uses.contains(object)) {
                             counter_condition.wait(lock);
                         }
                         // There is now no current use of the object.  Record writer thread.
@@ -299,7 +349,7 @@
         bool skip = false;
         loader_platform_thread_id tid = loader_platform_get_thread_id();
         std::unique_lock<std::mutex> lock(counter_lock);
-        if (uses.find(object) == uses.end()) {
+        if (!uses.contains(object)) {
             // There is no current use of the object.  Record reader count
             struct object_use_data *use_data = &uses[object];
             use_data->reader_count = 1;
@@ -314,7 +364,7 @@
                 typeName, (uint64_t)uses[object].thread, (uint64_t)tid);
             if (skip) {
                 // Wait for thread-safe access to object instead of skipping call.
-                while (uses.find(object) != uses.end()) {
+                while (uses.contains(object)) {
                     counter_condition.wait(lock);
                 }
                 // There is no current use of the object.  Record reader count