layers: faster threading layer for single thread

Add a check to the threading layer to detect a single thread case.
If the application has only called vulkan from one thread at a time,
then skip access counters for externally synchronized parameters.
This greatly reduces the overhead of the layer for applications
that don't truly use vulkan in a multi-threaded way.
diff --git a/layers/threading.cpp b/layers/threading.cpp
index d98aa1c..ed7663a 100644
--- a/layers/threading.cpp
+++ b/layers/threading.cpp
@@ -94,9 +94,16 @@
         }
     }
 
-    startWriteObject(my_data, instance);
+    bool threadChecks = startMultiThread();
+    if (threadChecks) {
+        startWriteObject(my_data, instance);
+    }
     pTable->DestroyInstance(instance, pAllocator);
-    finishWriteObject(my_data, instance);
+    if (threadChecks) {
+        finishWriteObject(my_data, instance);
+    } else {
+        finishMultiThread();
+    }
 
     // Disable and cleanup the temporary callback(s):
     if (callback_setup) {
@@ -153,9 +160,16 @@
 VKAPI_ATTR void VKAPI_CALL DestroyDevice(VkDevice device, const VkAllocationCallbacks *pAllocator) {
     dispatch_key key = get_dispatch_key(device);
     layer_data *dev_data = get_my_data_ptr(key, layer_data_map);
-    startWriteObject(dev_data, device);
+    bool threadChecks = startMultiThread();
+    if (threadChecks) {
+        startWriteObject(dev_data, device);
+    }
     dev_data->device_dispatch_table->DestroyDevice(device, pAllocator);
-    finishWriteObject(dev_data, device);
+    if (threadChecks) {
+        finishWriteObject(dev_data, device);
+    } else {
+        finishMultiThread();
+    }
     layer_data_map.erase(key);
 }
 
@@ -281,25 +295,39 @@
 CreateDebugReportCallbackEXT(VkInstance instance, const VkDebugReportCallbackCreateInfoEXT *pCreateInfo,
                              const VkAllocationCallbacks *pAllocator, VkDebugReportCallbackEXT *pMsgCallback) {
     layer_data *my_data = get_my_data_ptr(get_dispatch_key(instance), layer_data_map);
-    startReadObject(my_data, instance);
+    bool threadChecks = startMultiThread();
+    if (threadChecks) {
+        startReadObject(my_data, instance);
+    }
     VkResult result =
         my_data->instance_dispatch_table->CreateDebugReportCallbackEXT(instance, pCreateInfo, pAllocator, pMsgCallback);
     if (VK_SUCCESS == result) {
         result = layer_create_msg_callback(my_data->report_data, false, pCreateInfo, pAllocator, pMsgCallback);
     }
-    finishReadObject(my_data, instance);
+    if (threadChecks) {
+        finishReadObject(my_data, instance);
+    } else {
+        finishMultiThread();
+    }
     return result;
 }
 
 VKAPI_ATTR void VKAPI_CALL
 DestroyDebugReportCallbackEXT(VkInstance instance, VkDebugReportCallbackEXT callback, const VkAllocationCallbacks *pAllocator) {
     layer_data *my_data = get_my_data_ptr(get_dispatch_key(instance), layer_data_map);
-    startReadObject(my_data, instance);
-    startWriteObject(my_data, callback);
+    bool threadChecks = startMultiThread();
+    if (threadChecks) {
+        startReadObject(my_data, instance);
+        startWriteObject(my_data, callback);
+    }
     my_data->instance_dispatch_table->DestroyDebugReportCallbackEXT(instance, callback, pAllocator);
     layer_destroy_msg_callback(my_data->report_data, callback, pAllocator);
-    finishReadObject(my_data, instance);
-    finishWriteObject(my_data, callback);
+    if (threadChecks) {
+        finishReadObject(my_data, instance);
+        finishWriteObject(my_data, callback);
+    } else {
+        finishMultiThread();
+    }
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL
@@ -308,12 +336,19 @@
     layer_data *my_data = get_my_data_ptr(key, layer_data_map);
     VkLayerDispatchTable *pTable = my_data->device_dispatch_table;
     VkResult result;
-    startReadObject(my_data, device);
-    startWriteObject(my_data, pAllocateInfo->commandPool);
+    bool threadChecks = startMultiThread();
+    if (threadChecks) {
+        startReadObject(my_data, device);
+        startWriteObject(my_data, pAllocateInfo->commandPool);
+    }
 
     result = pTable->AllocateCommandBuffers(device, pAllocateInfo, pCommandBuffers);
-    finishReadObject(my_data, device);
-    finishWriteObject(my_data, pAllocateInfo->commandPool);
+    if (threadChecks) {
+        finishReadObject(my_data, device);
+        finishWriteObject(my_data, pAllocateInfo->commandPool);
+    } else {
+        finishMultiThread();
+    }
 
     // Record mapping from command buffer to command pool
     if (VK_SUCCESS == result) {
@@ -332,19 +367,26 @@
     layer_data *my_data = get_my_data_ptr(key, layer_data_map);
     VkLayerDispatchTable *pTable = my_data->device_dispatch_table;
     const bool lockCommandPool = false; // pool is already directly locked
-    startReadObject(my_data, device);
-    startWriteObject(my_data, commandPool);
-    for (uint32_t index = 0; index < commandBufferCount; index++) {
-        startWriteObject(my_data, pCommandBuffers[index], lockCommandPool);
+    bool threadChecks = startMultiThread();
+    if (threadChecks) {
+        startReadObject(my_data, device);
+        startWriteObject(my_data, commandPool);
+        for (uint32_t index = 0; index < commandBufferCount; index++) {
+            startWriteObject(my_data, pCommandBuffers[index], lockCommandPool);
+        }
     }
 
     pTable->FreeCommandBuffers(device, commandPool, commandBufferCount, pCommandBuffers);
-    finishReadObject(my_data, device);
-    finishWriteObject(my_data, commandPool);
-    for (uint32_t index = 0; index < commandBufferCount; index++) {
-        finishWriteObject(my_data, pCommandBuffers[index], lockCommandPool);
-        std::lock_guard<std::mutex> lock(command_pool_lock);
-        command_pool_map.erase(pCommandBuffers[index]);
+    if (threadChecks) {
+        finishReadObject(my_data, device);
+        finishWriteObject(my_data, commandPool);
+        for (uint32_t index = 0; index < commandBufferCount; index++) {
+            finishWriteObject(my_data, pCommandBuffers[index], lockCommandPool);
+            std::lock_guard<std::mutex> lock(command_pool_lock);
+            command_pool_map.erase(pCommandBuffers[index]);
+        }
+    } else {
+        finishMultiThread();
     }
 }
 
diff --git a/layers/threading.h b/layers/threading.h
index 311ce7b..1d0924d 100644
--- a/layers/threading.h
+++ b/layers/threading.h
@@ -48,6 +48,26 @@
 
 struct layer_data;
 
+namespace threading {
+volatile bool vulkan_in_use = false;
+volatile bool vulkan_multi_threaded = false;
+// starting check if an application is using vulkan from multiple threads.
+inline bool startMultiThread() {
+    if (vulkan_multi_threaded) {
+        return true;
+    }
+    if (vulkan_in_use) {
+        vulkan_multi_threaded = true;
+        return true;
+    }
+    vulkan_in_use = true;
+    return false;
+}
+
+// finishing check if an application is using vulkan from multiple threads.
+inline void finishMultiThread() { vulkan_in_use = false; }
+} // namespace threading
+
 template <typename T> class counter {
   public:
     const char *typeName;