// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#include <xnnpack/allocator.h>  // for xnn_allocate_zero_memory / xnn_release_memory
#include <xnnpack/memory-planner.h>
#include <xnnpack/subgraph.h>

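// Memory planner for the intermediate tensor values of a subgraph: it tracks the size and the node lifetime of every
// value, then assigns each value an offset into a single shared memory arena so that values with non-overlapping
// lifetimes can reuse the same space.
//
// Typical usage: xnn_init_value_allocation_tracker() once per subgraph, xnn_add_value_allocation_tracker() for every
// value that needs arena space, and finally xnn_plan_value_allocation_tracker() to compute each value's
// 'alloc_offset' and the total 'mem_arena_size'.
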
// Checks whether the lifecycles (live ranges) of two xnn_values overlap.
inline static bool value_lifecycle_overlap(const struct xnn_value_usage* a, const struct xnn_value_usage* b) {
  assert(a->last_node >= a->first_node);
  assert(b->last_node >= b->first_node);
  if (a->first_node < b->first_node) {
    return a->last_node >= b->first_node;
  } else {
    return b->last_node >= a->first_node;
  }
}

// Comparison function used to sort xnn_value_usage pointers according to tensor_size in decreasing order.
static inline int cmp_value_usage_tensor_size(const void* a, const void* b) {
  const size_t tensor_size_a = (*(struct xnn_value_usage**)a)->tensor_size;
  const size_t tensor_size_b = (*(struct xnn_value_usage**)b)->tensor_size;
  return (tensor_size_b > tensor_size_a) - (tensor_size_b < tensor_size_a);
}

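// Computes the [first_node, last_node] live range of every value in 'subgraph', i.e. the first and the last node
// that uses the value as an input or produces it as an output.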
static void populate_value_lifecycle(const xnn_subgraph_t subgraph, struct xnn_value_usage* usage) {
  assert(subgraph != NULL);
  if (subgraph->num_nodes == 0) {
    return;
  }
  // first_node/last_node of every xnn_value_usage were zero-initialized in xnn_init_value_allocation_tracker, so a
  // value of 0 cannot distinguish "not set yet" from "used by node 0". Start from the second node, using 0 as the
  // "not set yet" marker, and handle the first node last.
  for (uint32_t nid = 1; nid < subgraph->num_nodes; ++nid) {
    const struct xnn_node* node = subgraph->nodes + nid;
    for (uint32_t i = 0; i < node->num_inputs; ++i) {
      if (usage[node->inputs[i]].first_node == 0) {
        usage[node->inputs[i]].first_node = nid;
      }
      usage[node->inputs[i]].last_node = nid;
    }
    for (uint32_t i = 0; i < node->num_outputs; ++i) {
      if (usage[node->outputs[i]].first_node == 0) {
        usage[node->outputs[i]].first_node = nid;
      }
      usage[node->outputs[i]].last_node = nid;
    }
  }
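  // Finally, mark the values touched by the first node: their first_node is 0 by definition, and their last_node is
  // either still 0 (never used again) or was already set by the loop above.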
  const struct xnn_node* first_node = subgraph->nodes;
  for (uint32_t i = 0; i < first_node->num_inputs; ++i) {
    usage[first_node->inputs[i]].first_node = 0;
  }
  for (uint32_t i = 0; i < first_node->num_outputs; ++i) {
    usage[first_node->outputs[i]].first_node = 0;
  }
}

// Represents a memory block [start, end).
struct memory_block {
  size_t start;
  size_t end;
};

// Comparison function used to sort memory_blocks according to 'start' in increasing order.
static inline int cmp_memory_block(const void* a, const void* b) {
  const size_t start_a = ((struct memory_block*)a)->start;
  const size_t start_b = ((struct memory_block*)b)->start;
  return (start_a > start_b) - (start_a < start_b);
}

// Given the current live memory blocks, returns the offset into the memory arena at which a to-be-allocated value of
// size 'to_alloc_size' should be placed.
static size_t find_value_alloc_offset(struct memory_block* live_mem_blocks,
                                      size_t num_mem_blocks,
                                      size_t to_alloc_size) {
  if (num_mem_blocks == 0) {
    return 0;
  }

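  // With a single live block, simply place the new value right after it.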
  if (num_mem_blocks == 1) {
    return live_mem_blocks[0].end;
  }

  // Sort memory blocks according to 'start' in increasing order.
  qsort(live_mem_blocks, num_mem_blocks, sizeof(struct memory_block), cmp_memory_block);

  // Coalesce overlapping or immediately adjacent memory blocks to form a list of non-overlapping memory blocks, in
  // order to find the smallest gap.
  size_t num_coalesced_mem_blocks = 1;
  for (size_t i = 1; i < num_mem_blocks; ++i) {
    const size_t current_coalesced_end =
        live_mem_blocks[num_coalesced_mem_blocks - 1].end;
    if (live_mem_blocks[i].start > current_coalesced_end) {
      assert(num_coalesced_mem_blocks <= i);
      live_mem_blocks[num_coalesced_mem_blocks] = live_mem_blocks[i];
      num_coalesced_mem_blocks++;
      continue;
    }
    if (live_mem_blocks[i].end > current_coalesced_end) {
      live_mem_blocks[num_coalesced_mem_blocks - 1].end = live_mem_blocks[i].end;
    }
  }

  size_t smallest_gap_size = SIZE_MAX;
  // Best-fit: index of the coalesced block after whose end 'to_alloc_size' will be placed. It defaults to the last
  // block (append at the end of the arena) and is updated to the block preceding the smallest gap that fits.
  size_t smallest_gap_index = num_coalesced_mem_blocks - 1;
  for (size_t i = 0; i < num_coalesced_mem_blocks - 1; ++i) {
    assert(live_mem_blocks[i + 1].start > live_mem_blocks[i].end);
    const size_t gap = live_mem_blocks[i + 1].start - live_mem_blocks[i].end;
    if (gap >= to_alloc_size && gap < smallest_gap_size) {
      smallest_gap_index = i;
      smallest_gap_size = gap;
    }
  }
  return live_mem_blocks[smallest_gap_index].end;
}

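// Initializes the tracker for 'subgraph': allocates one zero-initialized xnn_value_usage per value and, when
// XNN_ENABLE_MEMOPT is enabled, records each value's first/last usage node.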
void xnn_init_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker, const xnn_subgraph_t subgraph) {
  tracker->subgraph = subgraph;
  tracker->mem_arena_size = 0;
  tracker->usage = xnn_allocate_zero_memory(sizeof(struct xnn_value_usage) * subgraph->num_values);
#if XNN_ENABLE_MEMOPT
  populate_value_lifecycle(tracker->subgraph, tracker->usage);
#endif
  tracker->min_value_id = XNN_INVALID_VALUE_ID;
  tracker->max_value_id = XNN_INVALID_VALUE_ID;
}

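// Registers a value of 'tensor_size' bytes with the tracker. Value IDs must be added in increasing order.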
void xnn_add_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker,
                                      uint32_t value_id,
                                      size_t tensor_size) {
  tracker->usage[value_id].tensor_size = tensor_size;
  if (tracker->min_value_id == XNN_INVALID_VALUE_ID) {
    tracker->min_value_id = value_id;
  } else {
    // Note that values are expected to be added in increasing order.
    assert(value_id > tracker->min_value_id);
    assert(value_id > tracker->max_value_id);
  }

  tracker->max_value_id = value_id;
}

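// Assigns an arena offset to every registered value and computes the total arena size. With XNN_ENABLE_MEMOPT,
// values are planned greedily in decreasing size order and may reuse the space of values whose lifetimes do not
// overlap; otherwise each value gets its own non-overlapping region of the arena.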
void xnn_plan_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker) {
#if XNN_ENABLE_MEMOPT
  if (tracker->min_value_id == XNN_INVALID_VALUE_ID) {
    assert(tracker->max_value_id == XNN_INVALID_VALUE_ID);
    return;
  }

  const uint32_t num_values = tracker->max_value_id - tracker->min_value_id + 1;
  struct xnn_value_usage** sorted_usage = xnn_allocate_zero_memory(sizeof(struct xnn_value_usage*) * num_values);
  size_t num_values_to_alloc = 0;
  for (size_t i = tracker->min_value_id; i <= tracker->max_value_id; ++i) {
    struct xnn_value_usage* info = tracker->usage + i;
    if (info->tensor_size != 0) {
      sorted_usage[num_values_to_alloc++] = info;
    }
  }
  qsort(sorted_usage, num_values_to_alloc, sizeof(struct xnn_value_usage*), cmp_value_usage_tensor_size);

  // Start the allocation planning process.
  struct memory_block* current_live_mem_blocks = xnn_allocate_zero_memory(
      sizeof(struct memory_block) * num_values_to_alloc);
  size_t mem_arena_size = 0;
  for (size_t i = 0; i < num_values_to_alloc; ++i) {
    size_t num_live_mem_blocks = 0;
    struct xnn_value_usage* current = sorted_usage[i];
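    // Collect the memory blocks of the already-planned values whose lifetimes overlap with the current value; only
    // those blocks constrain where the current value can be placed.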
    for (size_t j = 0; j < i; ++j) {
      const struct xnn_value_usage* allocated = sorted_usage[j];
      if (value_lifecycle_overlap(current, allocated)) {
        current_live_mem_blocks[num_live_mem_blocks++] = (struct memory_block){
          .start = allocated->alloc_offset,
          .end = allocated->alloc_offset + allocated->tensor_size,
        };
      }
    }
    current->alloc_offset = find_value_alloc_offset(current_live_mem_blocks, num_live_mem_blocks, current->tensor_size);
    if (mem_arena_size < current->alloc_offset + current->tensor_size) {
      mem_arena_size = current->alloc_offset + current->tensor_size;
    }
  }

  tracker->mem_arena_size = mem_arena_size;
  xnn_release_memory(sorted_usage);
  xnn_release_memory(current_live_mem_blocks);
#else
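  // Without XNN_ENABLE_MEMOPT there is no memory reuse: values are simply laid out back-to-back in the arena.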
  tracker->mem_arena_size = 0;
  if (tracker->min_value_id == XNN_INVALID_VALUE_ID) {
    // Nothing to plan if no values were registered (mirrors the check in the XNN_ENABLE_MEMOPT path above).
    assert(tracker->max_value_id == XNN_INVALID_VALUE_ID);
    return;
  }
  for (uint32_t i = tracker->min_value_id; i <= tracker->max_value_id; ++i) {
    if (tracker->usage[i].tensor_size > 0) {
      tracker->usage[i].alloc_offset = tracker->mem_arena_size;
      tracker->mem_arena_size += tracker->usage[i].tensor_size;
    }
  }
#endif
}