loader: Move CreateDevice to device table

Discovered an issue where a layer was doing cleanup
in its DestroyDevice function but its CreateDevice
was never called.
This happened because the extension was only enabled
on the device chain and the device chain doesn't actually
call CreateDevice. That happens on the Instance chain.
This change makes it possible to call down the device chain,
which is terminated by the ICD.
We need to know the real device object to construct the
device chain hierarchy, and when calling down the device
chain it should end with the ICD doing the actual device
object creation.

This patch fixes the issue by using the
same process as CreateInstance. The loader calls
the ICD's CreateDevice and passes the resulting device in the
*pDevice argument. The layers then ignore the PhysicalDevice
parameter and use *pDevice to access the device chain.
To prevent the ICD from being called twice, we needed to
add a special loader_GetDeviceChainProcAddr that provides
a stub for CreateDevice only as the end of the chain.

Integrate review feedback.
diff --git a/include/vkLayer.h b/include/vkLayer.h
index 971f030..3742725 100644
--- a/include/vkLayer.h
+++ b/include/vkLayer.h
@@ -29,6 +29,7 @@
 typedef struct VkLayerDispatchTable_
 {
     PFN_vkGetDeviceProcAddr GetDeviceProcAddr;
+    PFN_vkCreateDevice CreateDevice;
     PFN_vkDestroyDevice DestroyDevice;
     PFN_vkGetDeviceQueue GetDeviceQueue;
     PFN_vkQueueSubmit QueueSubmit;
@@ -144,7 +145,6 @@
     PFN_vkGetPhysicalDeviceFeatures GetPhysicalDeviceFeatures;
     PFN_vkGetPhysicalDeviceFormatInfo GetPhysicalDeviceFormatInfo;
     PFN_vkGetPhysicalDeviceLimits GetPhysicalDeviceLimits;
-    PFN_vkCreateDevice CreateDevice;
     PFN_vkGetPhysicalDeviceProperties GetPhysicalDeviceProperties;
     PFN_vkGetPhysicalDevicePerformance GetPhysicalDevicePerformance;
     PFN_vkGetPhysicalDeviceQueueCount GetPhysicalDeviceQueueCount;
diff --git a/layers/basic.cpp b/layers/basic.cpp
index 990f669..de2755b 100644
--- a/layers/basic.cpp
+++ b/layers/basic.cpp
@@ -95,7 +95,7 @@
 VK_LAYER_EXPORT VkResult VKAPI vkCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo, VkDevice* pDevice)
 {
     printf("At start of wrapped vkCreateDevice() call w/ gpu: %p\n", (void*)gpu);
-    VkResult result = instance_dispatch_table(gpu)->CreateDevice(gpu, pCreateInfo, pDevice);
+    VkResult result = device_dispatch_table(*pDevice)->CreateDevice(gpu, pCreateInfo, pDevice);
     printf("Completed wrapped vkCreateDevice() call w/ pDevice, Device %p: %p\n", (void*)pDevice, (void *) *pDevice);
     return result;
 }
@@ -137,6 +137,8 @@
         return (void *) vkGetDeviceProcAddr;
     }
 
+    if (!strcmp("vkCreateDevice", pName))
+        return (void *) vkCreateDevice;
     if (!strcmp("vkDestroyDevice", pName))
         return (void *) vkDestroyDevice;
     if (!strcmp("vkLayerExtension1", pName))
@@ -170,8 +172,6 @@
         return (void*) vkGetGlobalExtensionCount;
     if (!strcmp("vkGetGlobalExtensionProperties", pName))
         return (void*) vkGetGlobalExtensionProperties;
-    if (!strcmp("vkCreateDevice", pName))
-        return (void *) vkCreateDevice;
     else
     {
         if (instance_dispatch_table(instance)->GetInstanceProcAddr == NULL)
diff --git a/layers/draw_state.cpp b/layers/draw_state.cpp
index 0424416..146c641 100644
--- a/layers/draw_state.cpp
+++ b/layers/draw_state.cpp
@@ -1581,8 +1581,8 @@
 
 VK_LAYER_EXPORT VkResult VKAPI vkCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo, VkDevice* pDevice)
 {
-    VkLayerInstanceDispatchTable *pInstanceTable = get_dispatch_table(draw_state_instance_table_map, gpu);
-    VkResult result = pInstanceTable->CreateDevice(gpu, pCreateInfo, pDevice);
+    VkLayerDispatchTable *pDeviceTable = get_dispatch_table(draw_state_device_table_map, *pDevice);
+    VkResult result = pDeviceTable->CreateDevice(gpu, pCreateInfo, pDevice);
     if (result == VK_SUCCESS) {
         layer_data *my_instance_data = get_my_data_ptr(get_dispatch_key(gpu), layer_data_map);
         VkLayerDispatchTable *pTable = get_dispatch_table(draw_state_device_table_map, *pDevice);
@@ -2912,6 +2912,8 @@
         initDeviceTable(draw_state_device_table_map, (const VkBaseLayerObject *) dev);
         return (void *) vkGetDeviceProcAddr;
     }
+    if (!strcmp(funcName, "vkCreateDevice"))
+        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkDestroyDevice"))
         return (void*) vkDestroyDevice;
     if (!strcmp(funcName, "vkQueueSubmit"))
@@ -3071,8 +3073,6 @@
         return (void *) vkCreateInstance;
     if (!strcmp(funcName, "vkDestroyInstance"))
         return (void *) vkDestroyInstance;
-    if (!strcmp(funcName, "vkCreateDevice"))
-        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkGetPhysicalDeviceExtensionCount"))
         return (void*) vkGetPhysicalDeviceExtensionCount;
     if (!strcmp(funcName, "vkGetPhysicalDeviceExtensionProperties"))
diff --git a/layers/image.cpp b/layers/image.cpp
index dab89e9..0282795 100644
--- a/layers/image.cpp
+++ b/layers/image.cpp
@@ -169,7 +169,7 @@
 
 VK_LAYER_EXPORT VkResult VKAPI vkCreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo* pCreateInfo, VkDevice* pDevice)
 {
-    VkLayerInstanceDispatchTable *pTable = get_dispatch_table(image_instance_table_map, physicalDevice);
+    VkLayerDispatchTable *pTable = get_dispatch_table(image_device_table_map, *pDevice);
     VkResult result = pTable->CreateDevice(physicalDevice, pCreateInfo, pDevice);
     if(result == VK_SUCCESS)
     {
@@ -384,6 +384,8 @@
         return (void*) vkGetDeviceProcAddr;
     }
 
+    if (!strcmp(funcName, "vkCreateDevice"))
+        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkDestroyDevice"))
         return (void*) vkDestroyDevice;
     if (!strcmp(funcName, "vkCreateImage"))
@@ -418,8 +420,6 @@
         return (void*) vkCreateInstance;
     if (!strcmp(funcName, "vkDestroyInstance"))
         return (void *) vkDestroyInstance;
-    if (!strcmp(funcName, "vkCreateDevice"))
-        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkGetPhysicalDeviceExtensionProperties"))
         return (void*) vkGetPhysicalDeviceExtensionProperties;
     if (!strcmp(funcName, "vkGetPhysicalDeviceExtensionCount"))
diff --git a/layers/layers_table.cpp b/layers/layers_table.cpp
index b2edb5c..51baf2e 100644
--- a/layers/layers_table.cpp
+++ b/layers/layers_table.cpp
@@ -34,7 +34,6 @@
 // Map lookup must be thread safe
 VkLayerDispatchTable *device_dispatch_table(VkObject object)
 {
-//    VkLayerDispatchTable *pDisp  = *(VkLayerDispatchTable **) object;
     dispatch_key key = get_dispatch_key(object);
     device_table_map::const_iterator it = tableMap.find((void *) key);
     assert(it != tableMap.end() && "Not able to find device dispatch entry");
@@ -43,7 +42,6 @@
 
 VkLayerInstanceDispatchTable *instance_dispatch_table(VkObject object)
 {
-//    VkLayerInstanceDispatchTable *pDisp = *(VkLayerInstanceDispatchTable **) object;
     dispatch_key key = get_dispatch_key(object);
     instance_table_map::const_iterator it = tableInstanceMap.find((void *) key);
 #if DISPATCH_MAP_DEBUG
diff --git a/layers/mem_tracker.cpp b/layers/mem_tracker.cpp
index 817cb81..789899a 100644
--- a/layers/mem_tracker.cpp
+++ b/layers/mem_tracker.cpp
@@ -889,8 +889,8 @@
     const VkDeviceCreateInfo *pCreateInfo,
     VkDevice                 *pDevice)
 {
-    VkLayerInstanceDispatchTable *pInstanceTable = get_dispatch_table(mem_tracker_instance_table_map, gpu);
-    VkResult result = pInstanceTable->CreateDevice(gpu, pCreateInfo, pDevice);
+    VkLayerDispatchTable *pDeviceTable = get_dispatch_table(mem_tracker_device_table_map, *pDevice);
+    VkResult result = pDeviceTable->CreateDevice(gpu, pCreateInfo, pDevice);
     if (result == VK_SUCCESS) {
         layer_data *my_instance_data = get_my_data_ptr(get_dispatch_key(gpu), layer_data_map);
         layer_data *my_device_data = get_my_data_ptr(get_dispatch_key(*pDevice), layer_data_map);
@@ -2146,6 +2146,8 @@
         initDeviceTable(mem_tracker_device_table_map, (const VkBaseLayerObject *) dev);
         return (void *) vkGetDeviceProcAddr;
     }
+    if (!strcmp(funcName, "vkCreateDevice"))
+        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkDestroyDevice"))
         return (void*) vkDestroyDevice;
     if (!strcmp(funcName, "vkQueueSubmit"))
@@ -2304,8 +2306,6 @@
         return (void *) vkDestroyInstance;
     if (!strcmp(funcName, "vkCreateInstance"))
         return (void*) vkCreateInstance;
-    if (!strcmp(funcName, "vkCreateDevice"))
-        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkGetPhysicalDeviceExtensionCount"))
         return (void*) vkGetGlobalExtensionCount;
     if (!strcmp(funcName, "vkGetPhysicalDeviceExtensionProperties"))
diff --git a/layers/multi.cpp b/layers/multi.cpp
index 5f86b88..28dacf9 100644
--- a/layers/multi.cpp
+++ b/layers/multi.cpp
@@ -306,9 +306,8 @@
 VK_LAYER_EXPORT VkResult VKAPI multi2CreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
                                                       VkDevice* pDevice)
 {
-    VkLayerInstanceDispatchTable **ppDisp = (VkLayerInstanceDispatchTable **) gpu;
     printf("At start of multi2 vkCreateDevice()\n");
-    VkResult result = instance_dispatch_table2(gpu)->CreateDevice(gpu, pCreateInfo, pDevice);
+    VkResult result = device_dispatch_table2(*pDevice)->CreateDevice(gpu, pCreateInfo, pDevice);
     printf("Completed multi2 layer vkCreateDevice()\n");
     return result;
 }
@@ -346,6 +345,8 @@
         getLayer2Table(devw);
         return (void *) multi2GetDeviceProcAddr;
     }
+    if (!strcmp("vkCreateDevice", pName))
+        return (void *) multi2CreateDevice;
     if (!strcmp("vkDestroyDevice", pName))
         return (void *) multi2DestroyDevice;
     if (!strcmp("vkCreateCommandBuffer", pName))
@@ -376,8 +377,6 @@
         return (void *) multi2EnumeratePhysicalDevices;
     if (!strcmp("vkDestroyInstance", pName))
         return (void *) multi2DestroyInstance;
-    if (!strcmp("vkCreateDevice", pName))
-        return (void *) multi2CreateDevice;
     else if (!strcmp("GetGlobalExtensionProperties", pName))
         return (void*) vkGetGlobalExtensionProperties;
     else if (!strcmp("GetGlobalExtensionCount", pName))
diff --git a/layers/object_track.h b/layers/object_track.h
index 643c3b5..329ca10 100644
--- a/layers/object_track.h
+++ b/layers/object_track.h
@@ -528,14 +528,15 @@
     VkDevice                 *pDevice)
 {
     loader_platform_thread_lock_mutex(&objLock);
-    VkLayerInstanceDispatchTable *pInstanceTable = get_dispatch_table(ObjectTracker_instance_table_map, gpu);
-    VkResult result = pInstanceTable->CreateDevice(gpu, pCreateInfo, pDevice);
+//    VkLayerInstanceDispatchTable *pInstanceTable = get_dispatch_table(ObjectTracker_instance_table_map, gpu);
+    VkLayerDispatchTable *pDeviceTable = get_dispatch_table(ObjectTracker_device_table_map, *pDevice);
+    VkResult result = pDeviceTable->CreateDevice(gpu, pCreateInfo, pDevice);
     if (result == VK_SUCCESS) {
         layer_data *my_instance_data = get_my_data_ptr(get_dispatch_key(gpu), layer_data_map);
         //// VkLayerDispatchTable *pTable = get_dispatch_table(ObjectTracker_device_table_map, *pDevice);
         layer_data *my_device_data = get_my_data_ptr(get_dispatch_key(*pDevice), layer_data_map);
         my_device_data->report_data = layer_debug_report_create_device(my_instance_data->report_data, *pDevice);
-        create_obj(gpu, *pDevice, VK_OBJECT_TYPE_DEVICE);
+        create_obj(*pDevice, *pDevice, VK_OBJECT_TYPE_DEVICE);
     }
 
     loader_platform_thread_unlock_mutex(&objLock);
diff --git a/layers/param_checker.cpp b/layers/param_checker.cpp
index 1645102..70ced01 100644
--- a/layers/param_checker.cpp
+++ b/layers/param_checker.cpp
@@ -1801,7 +1801,7 @@
     const VkDeviceCreateInfo* pCreateInfo,
     VkDevice* pDevice)
 {
-    VkLayerInstanceDispatchTable *pTable = get_dispatch_table(pc_instance_table_map, physicalDevice);
+    VkLayerDispatchTable *pTable = get_dispatch_table(pc_device_table_map, *pDevice);
     VkResult result = pTable->CreateDevice(physicalDevice, pCreateInfo, pDevice);
     if(result == VK_SUCCESS)
     {
@@ -9672,6 +9672,8 @@
         return (void*) vkGetDeviceProcAddr;
     }
 
+    if (!strcmp(funcName, "vkCreateDevice"))
+        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkDestroyDevice"))
         return (void*) vkDestroyDevice;
     if (!strcmp(funcName, "vkGetDeviceQueue"))
@@ -9882,8 +9884,6 @@
         return (void*) vkCreateInstance;
     if (!strcmp(funcName, "vkDestroyInstance"))
         return (void*) vkDestroyInstance;
-    if (!strcmp(funcName, "vkCreateDevice"))
-        return (void*) vkCreateDevice;
     if (!strcmp(funcName, "vkEnumeratePhysicalDevices"))
         return (void*) vkEnumeratePhysicalDevices;
     if (!strcmp(funcName, "vkGetPhysicalDeviceExtensionCount"))
diff --git a/layers/screenshot.cpp b/layers/screenshot.cpp
index d46205b..f7927df 100644
--- a/layers/screenshot.cpp
+++ b/layers/screenshot.cpp
@@ -311,8 +311,8 @@
     const VkDeviceCreateInfo *pCreateInfo,
     VkDevice                 *pDevice)
 {
-    VkLayerInstanceDispatchTable *pInstanceTable = get_dispatch_table(screenshot_instance_table_map, gpu);
-    VkResult result = pInstanceTable->CreateDevice(gpu, pCreateInfo, pDevice);
+    VkLayerDispatchTable *pDisp  = get_dispatch_table(screenshot_device_table_map, *pDevice);
+    VkResult result = pDisp->CreateDevice(gpu, pCreateInfo, pDevice);
 
     if (result == VK_SUCCESS) {
         createDeviceRegisterExtensions(pCreateInfo, *pDevice);
@@ -321,6 +321,8 @@
     return result;
 }
 
+/* TODO: Probably need a DestroyDevice as well */
+
 #define SCREENSHOT_LAYER_EXT_ARRAY_SIZE 2
 static const VkExtensionProperties ssExts[SCREENSHOT_LAYER_EXT_ARRAY_SIZE] = {
     {
@@ -561,6 +563,9 @@
         initDeviceTable(screenshot_device_table_map, (const VkBaseLayerObject *) dev);
         return (void *) vkGetDeviceProcAddr;
     }
+    if (!strcmp(funcName, "vkCreateDevice"))
+        return (void*) vkCreateDevice;
+
     if (!strcmp(funcName, "vkGetDeviceQueue"))
         return (void*) vkGetDeviceQueue;
 
@@ -610,8 +615,6 @@
         return (void *) vkDestroyInstance;
     if (!strcmp(funcName, "vkCreateInstance"))
         return (void*) vkCreateInstance;
-    if (!strcmp(funcName, "vkCreateDevice"))
-        return (void*) vkCreateDevice;
 
     if (get_dispatch_table(screenshot_instance_table_map, instance)->GetInstanceProcAddr == NULL)
         return NULL;
diff --git a/loader/loader.c b/loader/loader.c
index 00b46d7..30a5101 100644
--- a/loader/loader.c
+++ b/loader/loader.c
@@ -92,7 +92,6 @@
     .GetPhysicalDeviceFeatures = loader_GetPhysicalDeviceFeatures,
     .GetPhysicalDeviceFormatInfo = loader_GetPhysicalDeviceFormatInfo,
     .GetPhysicalDeviceLimits = loader_GetPhysicalDeviceLimits,
-    .CreateDevice = loader_CreateDevice,
     .GetPhysicalDeviceProperties = loader_GetPhysicalDeviceProperties,
     .GetPhysicalDevicePerformance = loader_GetPhysicalDevicePerformance,
     .GetPhysicalDeviceQueueCount = loader_GetPhysicalDeviceQueueCount,
@@ -1440,12 +1439,37 @@
                 ext_list);
 }
 
+static VkResult scratch_vkCreateDevice(
+    VkPhysicalDevice          gpu,
+    const VkDeviceCreateInfo *pCreateInfo,
+    VkDevice                 *pDevice)
+{
+    return VK_SUCCESS;
+}
+
+static void * VKAPI loader_GetDeviceChainProcAddr(VkDevice device, const char * name)
+{
+    const VkLayerDispatchTable *disp_table = * (VkLayerDispatchTable **) device;
+
+    /* CreateDevice workaround: Make the terminator be a scratch function
+     * that does nothing since we have already called the ICD's create device.
+     * We can then call down the device chain and have all the layers get set up.
+     */
+    if (!strcmp(name, "vkGetDeviceProcAddr"))
+        return (void *) loader_GetDeviceChainProcAddr;
+    if (!strcmp(name, "vkCreateDevice"))
+        return (void *) scratch_vkCreateDevice;
+
+    return disp_table->GetDeviceProcAddr(device, name);
+}
+
 static uint32_t loader_activate_device_layers(
-            VkDevice device,
-            struct loader_device *dev,
-            struct loader_icd *icd,
-            uint32_t ext_count,
-            const VkExtensionProperties *ext_props)
+        VkPhysicalDevice gpu,
+        VkDevice device,
+        struct loader_device *dev,
+        struct loader_icd *icd,
+        uint32_t ext_count,
+        const VkExtensionProperties *ext_props)
 {
     if (!icd)
         return 0;
@@ -1458,8 +1482,9 @@
     VkObject nextObj = (VkObject) device;
     VkObject baseObj = nextObj;
     VkBaseLayerObject *nextGpuObj;
-    PFN_vkGetDeviceProcAddr nextGPA = icd->GetDeviceProcAddr;
+    PFN_vkGetDeviceProcAddr nextGPA = loader_GetDeviceChainProcAddr;
     VkBaseLayerObject *wrappedGpus;
+
     /*
      * Figure out how many actual layers will need to be wrapped.
      */
@@ -1482,6 +1507,7 @@
         loader_log(VK_DBG_REPORT_ERROR_BIT, 0, "Failed to malloc Gpu objects for layer");
         return 0;
     }
+
     for (int32_t i = dev->activated_layer_list.count - 1; i >= 0; i--) {
 
         struct loader_extension_property *ext_prop = &dev->activated_layer_list.list[i];
@@ -1838,50 +1864,58 @@
     uint32_t gpu_index;
     struct loader_icd *icd = loader_get_icd(gpu, &gpu_index);
     struct loader_device *dev;
-    VkResult res = VK_ERROR_INITIALIZATION_FAILED;
+    VkResult res;
 
-    if (icd->CreateDevice) {
-        res = icd->CreateDevice(gpu, pCreateInfo, pDevice);
-        if (res != VK_SUCCESS) {
-            return res;
-        }
-        dev = loader_add_logical_device(*pDevice, &icd->logical_device_list);
-        if (dev == NULL) {
-            return VK_ERROR_OUT_OF_HOST_MEMORY;
-        }
-        PFN_vkGetDeviceProcAddr get_proc_addr = icd->GetDeviceProcAddr;
-        loader_init_device_dispatch_table(&dev->loader_dispatch, get_proc_addr,
-                                          icd->gpus[gpu_index], icd->gpus[gpu_index]);
-
-        loader_init_dispatch(*pDevice, &dev->loader_dispatch);
-
-        dev->app_extension_count = pCreateInfo->extensionCount;
-        dev->app_extension_props = (VkExtensionProperties *) malloc(sizeof(VkExtensionProperties) * pCreateInfo->extensionCount);
-        if (dev->app_extension_props == NULL && (dev->app_extension_count > 0)) {
-            return VK_ERROR_OUT_OF_HOST_MEMORY;
-        }
-
-        /* Make local copy of extension list */
-        if (dev->app_extension_count > 0 && dev->app_extension_props != NULL) {
-            memcpy(dev->app_extension_props, pCreateInfo->pEnabledExtensions, sizeof(VkExtensionProperties) * pCreateInfo->extensionCount);
-        }
-
-        /*
-         * Put together the complete list of extensions to enable
-         * This includes extensions requested via environment variables.
-         */
-        loader_enable_device_layers(dev, &icd->device_extension_cache[gpu_index]);
-
-        /*
-         * Load the libraries needed by the extensions on the
-         * enabled extension list. This will build the device chain
-         * terminating with the selected device.
-         */
-        loader_activate_device_layers(*pDevice, dev, icd,
-                                      dev->app_extension_count,
-                                      dev->app_extension_props);
+    if (!icd->CreateDevice) {
+        return VK_ERROR_INITIALIZATION_FAILED;
     }
 
+    res = icd->CreateDevice(gpu, pCreateInfo, pDevice);
+    if (res != VK_SUCCESS) {
+        return res;
+    }
+
+    dev = loader_add_logical_device(*pDevice, &icd->logical_device_list);
+    if (dev == NULL) {
+        return VK_ERROR_OUT_OF_HOST_MEMORY;
+    }
+    PFN_vkGetDeviceProcAddr get_proc_addr = icd->GetDeviceProcAddr;
+    loader_init_device_dispatch_table(&dev->loader_dispatch, get_proc_addr,
+                                      icd->gpus[gpu_index], icd->gpus[gpu_index]);
+
+    dev->loader_dispatch.CreateDevice = scratch_vkCreateDevice;
+    loader_init_dispatch(*pDevice, &dev->loader_dispatch);
+
+    dev->app_extension_count = pCreateInfo->extensionCount;
+    dev->app_extension_props = (VkExtensionProperties *) malloc(sizeof(VkExtensionProperties) * pCreateInfo->extensionCount);
+    if (dev->app_extension_props == NULL && (dev->app_extension_count > 0)) {
+        return VK_ERROR_OUT_OF_HOST_MEMORY;
+    }
+
+    /* Make local copy of extension list */
+    if (dev->app_extension_count > 0 && dev->app_extension_props != NULL) {
+        memcpy(dev->app_extension_props, pCreateInfo->pEnabledExtensions, sizeof(VkExtensionProperties) * pCreateInfo->extensionCount);
+    }
+
+    /*
+     * Put together the complete list of extensions to enable
+     * This includes extensions requested via environment variables.
+     */
+    loader_enable_device_layers(dev, &icd->device_extension_cache[gpu_index]);
+
+    /*
+     * Load the libraries needed by the extensions on the
+     * enabled extension list. This will build the device chain
+     * terminating with the selected device.
+     */
+    loader_activate_device_layers(gpu, *pDevice, dev, icd,
+                                  dev->app_extension_count,
+                                  dev->app_extension_props);
+
+    res = dev->loader_dispatch.CreateDevice(gpu, pCreateInfo, pDevice);
+
+    dev->loader_dispatch.CreateDevice = icd->CreateDevice;
+
     return res;
 }
 
diff --git a/loader/table_ops.h b/loader/table_ops.h
index 6d15950..1bfb632 100644
--- a/loader/table_ops.h
+++ b/loader/table_ops.h
@@ -37,6 +37,7 @@
     //then use the gpa in their dispatch for subsequent layers in the chain
     table->GetDeviceProcAddr = (PFN_vkGetDeviceProcAddr) gpa(dev_next, "vkGetDeviceProcAddr");
 
+    table->CreateDevice = (PFN_vkCreateDevice) gpa(dev, "vkCreateDevice");
     table->DestroyDevice = (PFN_vkDestroyDevice) gpa(dev, "vkDestroyDevice");
     table->GetDeviceQueue = (PFN_vkGetDeviceQueue) gpa(dev, "vkGetDeviceQueue");
     table->QueueSubmit = (PFN_vkQueueSubmit) gpa(dev, "vkQueueSubmit");
@@ -151,6 +152,8 @@
     name += 2;
     if (!strcmp(name, "GetDeviceProcAddr"))
         return (void *) table->GetDeviceProcAddr;
+    if (!strcmp(name, "CreateDevice"))
+        return (void *) table->CreateDevice;
     if (!strcmp(name, "DestroyDevice"))
         return (void *) table->DestroyDevice;
     if (!strcmp(name, "GetDeviceQueue"))
@@ -360,7 +363,6 @@
     table->GetPhysicalDeviceFeatures = (PFN_vkGetPhysicalDeviceFeatures) gpa(inst, "vkGetPhysicalDeviceFeatures");
     table->GetPhysicalDeviceFormatInfo = (PFN_vkGetPhysicalDeviceFormatInfo) gpa(inst, "vkGetPhysicalDeviceFormatInfo");
     table->GetPhysicalDeviceLimits = (PFN_vkGetPhysicalDeviceLimits) gpa(inst, "vkGetPhysicalDeviceLimits");
-    table->CreateDevice = (PFN_vkCreateDevice) gpa(inst, "vkCreateDevice");
     table->GetPhysicalDeviceProperties = (PFN_vkGetPhysicalDeviceProperties) gpa(inst, "vkGetPhysicalDeviceProperties");
     table->GetPhysicalDevicePerformance = (PFN_vkGetPhysicalDevicePerformance) gpa(inst, "vkGetPhysicalDevicePerformance");
     table->GetPhysicalDeviceQueueCount = (PFN_vkGetPhysicalDeviceQueueCount) gpa(inst, "vkGetPhysicalDeviceQueueCount");
@@ -411,8 +413,6 @@
         return (void *) table->GetPhysicalDeviceMemoryProperties;
     if (!strcmp(name, "GetInstanceProcAddr"))
         return (void *) table->GetInstanceProcAddr;
-    if (!strcmp(name, "CreateDevice"))
-        return (void *) table->CreateDevice;
     if (!strcmp(name, "GetPhysicalDeviceExtensionCount"))
         return (void *) table->GetPhysicalDeviceExtensionCount;
     if (!strcmp(name, "GetPhysicalDeviceExtensionProperties"))
diff --git a/loader/trampoline.c b/loader/trampoline.c
index ec36373..d6a09ef 100644
--- a/loader/trampoline.c
+++ b/loader/trampoline.c
@@ -277,18 +277,16 @@
 }
 
 LOADER_EXPORT VkResult VKAPI vkCreateDevice(
-                                        VkPhysicalDevice gpu,
-                                        const VkDeviceCreateInfo* pCreateInfo,
-                                        VkDevice* pDevice)
+        VkPhysicalDevice gpu,
+        const VkDeviceCreateInfo* pCreateInfo,
+        VkDevice* pDevice)
 {
-    const VkLayerInstanceDispatchTable *disp;
     VkResult res;
 
-    disp = loader_get_instance_dispatch(gpu);
-
     loader_platform_thread_lock_mutex(&loader_lock);
-    // CreateDevice is dispatched on the instance chain
-    res = disp->CreateDevice(gpu, pCreateInfo, pDevice);
+
+    res = loader_CreateDevice(gpu, pCreateInfo, pDevice);
+
     loader_platform_thread_unlock_mutex(&loader_lock);
     return res;
 }
diff --git a/vk-generate.py b/vk-generate.py
index 51f56ad..d6f9c8d 100755
--- a/vk-generate.py
+++ b/vk-generate.py
@@ -122,7 +122,7 @@
             stmts.append("memset(table, 0, sizeof(*table));")
             stmts.append("table->GetDeviceProcAddr =(PFN_vkGetDeviceProcAddr)  gpa(device,\"vkGetDeviceProcAddr\");")
             for proto in self.protos:
-                if proto.name == "CreateInstance" or proto.name == "GetGlobalExtensionProperties" or proto.name == "GetGlobalExtensionCount" or proto.params[0].ty == "VkInstance" or proto.params[0].ty == "VkPhysicalDevice":
+                if proto.name == "CreateInstance" or proto.name == "GetGlobalExtensionProperties" or proto.name == "GetGlobalExtensionCount" or proto.params[0].ty == "VkInstance" or (proto.params[0].ty == "VkPhysicalDevice" and proto.name != "CreateDevice"):
                     continue
                 if proto.name != "GetDeviceProcAddr":
                     stmts.append("table->%s = (PFN_vk%s) gpa(baseDevice, \"vk%s\");" %
@@ -141,6 +141,8 @@
             for proto in self.protos:
                 if proto.name != "CreateInstance"  and proto.params[0].ty != "VkInstance" and proto.params[0].ty != "VkPhysicalDevice":
                     continue
+                if proto.name == "CreateDevice":
+                    continue
                 if proto.name != "GetInstanceProcAddr":
                     stmts.append("table->%s = (PFN_vk%s) gpa(baseInstance, \"vk%s\");" %
                           (proto.name, proto.name, proto.name))
diff --git a/vk-layer-generate.py b/vk-layer-generate.py
index a9973c0..34340b5 100755
--- a/vk-layer-generate.py
+++ b/vk-layer-generate.py
@@ -375,6 +375,8 @@
 
             if not proto in intercepted:
                 continue
+            if proto.name == "CreateDevice":
+                continue
             lookups.append("if (!strcmp(name, \"%s\"))" % proto.name)
             lookups.append("    return (void*) %s%s;" % (prefix, proto.name))
 
@@ -642,7 +644,7 @@
                      '    char str[1024];\n'
                      '    sprintf(str, "At start of layered %s\\n");\n'
                      '    layerCbMsg(VK_DBG_REPORT_INFO_BIT,VK_OBJECT_TYPE_PHYSICAL_DEVICE, gpu, 0, 0, (char *) "GENERIC", (char *) str);\n'
-                     '    %sinstance_dispatch_table(gpu)->%s;\n'
+                     '    %sdevice_dispatch_table(*pDevice)->%s;\n'
                      '    if (result == VK_SUCCESS) {\n'
                      '        enable_debug_report(pCreateInfo->extensionCount, pCreateInfo->pEnabledExtensions);\n'
                      '        createDeviceRegisterExtensions(pCreateInfo, *pDevice);\n'
@@ -1010,12 +1012,12 @@
             funcs.append('%s%s\n'
                      '{\n'
                      '    using namespace StreamControl;\n'
-                     '    %s%s_dispatch_table(%s)->%s;\n'
+                     '    %sdevice_dispatch_table(*pDevice)->%s;\n'
                      '    if (result == VK_SUCCESS)\n'
                      '        createDeviceRegisterExtensions(pCreateInfo, *pDevice);\n'
                      '    %s%s%s\n'
                      '%s'
-                     '}' % (qual, decl, ret_val, table_type, dispatch_param, proto.c_call(), f_open, log_func, f_close, stmt))
+                     '}' % (qual, decl, ret_val, proto.c_call(), f_open, log_func, f_close, stmt))
         elif proto.name == "DestroyDevice":
             funcs.append('%s%s\n'
                  '{\n'
@@ -1308,7 +1310,15 @@
         if 'WSI' in proto.name:
             return None
         # Initialize in early calls
-        if proto.params[0].ty == "VkPhysicalDevice":
+        if proto.name == "CreateDevice":
+            funcs.append('%s' % self.lineinfo.get())
+            funcs.append('%s%s\n'
+                     '{\n'
+                     '    %sdevice_dispatch_table(*pDevice)->%s;\n'
+                     '%s'
+                     '}' % (qual, decl, ret_val, proto.c_call(), stmt))
+            return "\n".join(funcs)
+        elif proto.params[0].ty == "VkPhysicalDevice":
             funcs.append('%s' % self.lineinfo.get())
             funcs.append('%s%s\n'
                      '{\n'