arm_compute v18.05
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index 50b0f0e..7f0e374 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/Allocator.h"
+#include "arm_compute/runtime/MemoryRegion.h"
#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -39,3 +41,9 @@
{
::operator delete(ptr);
}
+
+std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ return arm_compute::support::cpp14::make_unique<MemoryRegion>(size);
+}
\ No newline at end of file
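A minimal usage sketch of the new make_region() hook on the CPU Allocator, assuming the v18.05 headers; the 1 KiB size is illustrative, and the alignment argument is accepted but ignored by this backend (see the ARM_COMPUTE_UNUSED above).

    #include <memory>
    #include "arm_compute/runtime/Allocator.h"
    #include "arm_compute/runtime/IMemoryRegion.h"

    int main()
    {
        arm_compute::Allocator allocator;
        // Request a 1 KiB region; alignment is ignored by the CPU backend.
        std::unique_ptr<arm_compute::IMemoryRegion> region = allocator.make_region(1024, 0);
        void *storage = region->buffer(); // host pointer to the backing storage
        return storage != nullptr ? 0 : 1;
    }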
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 3ca5071..2a4ab6e 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,15 +57,15 @@
ARM_COMPUTE_ERROR_ON(!are_all_finalized());
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
- // Sort active group requirements in descending order.
- std::sort(std::begin(_active_elements), std::end(_active_elements), [](const Element & a, const Element & b)
+ // Sort free blobs in descending order of maximum size.
+ _free_blobs.sort([](const Blob & ba, const Blob & bb)
{
- return a.size > b.size;
+ return ba.max_size > bb.max_size;
});
std::vector<size_t> group_sizes;
- std::transform(std::begin(_active_elements), std::end(_active_elements), std::back_inserter(group_sizes), [](const Element & e)
+ std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
{
- return e.size;
+ return b.max_size;
});
// Update blob sizes
@@ -80,8 +80,14 @@
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
int blob_idx = 0;
- for(auto &e : _active_elements)
+ for(auto &free_blob : _free_blobs)
{
- group_mappings[e.handle] = blob_idx++;
+ for(auto &bound_element_id : free_blob.bound_elements)
+ {
+ ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
+ Element &bound_element = _active_elements[bound_element_id];
+ group_mappings[bound_element.handle] = blob_idx;
+ }
+ ++blob_idx;
}
}
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index 9a5c13a..84789e7 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -47,3 +49,9 @@
ARM_COMPUTE_ERROR_ON(ptr == nullptr);
clReleaseMemObject(static_cast<cl_mem>(ptr));
}
+
+std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+}
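The CL counterpart returns a CLBufferMemoryRegion bound to the allocator's context. A sketch, assuming a context-taking constructor and an available OpenCL runtime:

    #include "arm_compute/runtime/CL/CLBufferAllocator.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    int main()
    {
        arm_compute::CLScheduler::get().default_init(); // create context and queue
        arm_compute::CLBufferAllocator allocator(arm_compute::CLScheduler::get().context());
        // Backed by a cl::Buffer created with CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE.
        auto region = allocator.make_region(2048, 0);
        return region != nullptr ? 0 : 1;
    }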
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
index 3f5266c..c4ea639 100644
--- a/src/runtime/CL/CLHOG.cpp
+++ b/src/runtime/CL/CLHOG.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,11 +74,11 @@
uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
- return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size() * sizeof(float)));
}
void CLHOG::do_unmap(cl::CommandQueue &q)
{
ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
q.enqueueUnmapMemObject(_buffer, descriptor());
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
new file mode 100644
index 0000000..534c4f9
--- /dev/null
+++ b/src/runtime/CL/CLMemory.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemory.h"
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+CLMemory::CLMemory()
+ : _region(nullptr), _region_owned(nullptr)
+{
+ create_empty_region();
+}
+
+CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
+ : _region(nullptr), _region_owned(std::move(memory))
+{
+ if(_region_owned == nullptr)
+ {
+ create_empty_region();
+ }
+ _region = _region_owned.get();
+}
+
+CLMemory::CLMemory(ICLMemoryRegion *memory)
+ : _region(memory), _region_owned(nullptr)
+{
+ _region = memory;
+}
+
+ICLMemoryRegion *CLMemory::region()
+{
+ return _region;
+}
+
+ICLMemoryRegion *CLMemory::region() const
+{
+ return _region;
+}
+
+void CLMemory::create_empty_region()
+{
+ _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context::getDefault(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+ _region = _region_owned.get();
+}
+} // namespace arm_compute
\ No newline at end of file
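CLMemory distinguishes owning and non-owning wrappers over an ICLMemoryRegion, matching the two constructors above. A sketch, assuming a default OpenCL context is available:

    #include <memory>
    #include "arm_compute/runtime/CL/CLMemory.h"
    #include "arm_compute/runtime/CL/CLMemoryRegion.h"

    using namespace arm_compute;

    int main()
    {
        auto region = std::make_shared<CLBufferMemoryRegion>(
            cl::Context::getDefault(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 256);

        CLMemory owning(region);          // shared_ptr overload: shares ownership
        CLMemory borrowing(region.get()); // raw-pointer overload: no ownership taken
        return owning.region() == borrowing.region() ? 0 : 1;
    }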
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
new file mode 100644
index 0000000..15fd7f3
--- /dev/null
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+ICLMemoryRegion::ICLMemoryRegion(cl::Context ctx, size_t size)
+ : IMemoryRegion(size), _ctx(std::move(ctx)), _mapping(nullptr), _mem()
+{
+}
+
+const cl::Buffer &ICLMemoryRegion::cl_data() const
+{
+ return _mem;
+}
+
+void *ICLMemoryRegion::buffer()
+{
+ return _mapping;
+}
+
+void *ICLMemoryRegion::buffer() const
+{
+ return _mapping;
+}
+
+void **ICLMemoryRegion::handle()
+{
+ return reinterpret_cast<void **>(&_mem);
+}
+
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
+ : ICLMemoryRegion(std::move(ctx), size)
+{
+ if(_size != 0)
+ {
+ _mem = cl::Buffer(_ctx, flags, _size);
+ }
+}
+
+void *CLBufferMemoryRegion::ptr()
+{
+ return nullptr;
+}
+
+void *CLBufferMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ _mapping = q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, _size);
+ return _mapping;
+}
+
+void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ q.enqueueUnmapMemObject(_mem, _mapping);
+ _mapping = nullptr;
+}
+
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+ : ICLMemoryRegion(std::move(ctx), size), _ptr(nullptr)
+{
+ if(size != 0)
+ {
+ _ptr = clSVMAlloc(_ctx.get(), flags, size, alignment);
+ if(_ptr != nullptr)
+ {
+ _mem = cl::Buffer(_ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
+ }
+ }
+}
+
+ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
+{
+ if(_ptr != nullptr)
+ {
+ clFinish(CLScheduler::get().queue().get());
+ _mem = cl::Buffer();
+ clSVMFree(_ctx.get(), _ptr);
+ }
+}
+
+void *ICLSVMMemoryRegion::ptr()
+{
+ return _ptr;
+}
+
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+ : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+ clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+ _mapping = _ptr;
+ return _mapping;
+}
+
+void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+ clEnqueueSVMUnmap(q.get(), _ptr, 0, nullptr, nullptr);
+ _mapping = nullptr;
+}
+
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+ : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+ if(blocking)
+ {
+ clFinish(q.get());
+ }
+ _mapping = _ptr;
+ return _mapping;
+}
+
+void CLFineSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_UNUSED(q);
+ _mapping = nullptr;
+}
+} // namespace arm_compute
\ No newline at end of file
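All three region flavours share the same map()/unmap() contract: buffer() only yields a valid host pointer between the two calls. A sketch with the legacy buffer region, assuming an OpenCL device is present:

    #include "arm_compute/runtime/CL/CLMemoryRegion.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();
        cl::CommandQueue &queue = CLScheduler::get().queue();

        CLBufferMemoryRegion region(CLScheduler::get().context(),
                                    CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 64);
        void *host_ptr = region.map(queue, true); // blocking map
        // ... read or write the 64 bytes through host_ptr ...
        region.unmap(queue); // buffer() returns nullptr again afterwards
        return host_ptr != nullptr ? 0 : 1;
    }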
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 65292fe..fdae615 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -31,7 +31,7 @@
std::once_flag CLScheduler::_initialize_symbols;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
+ : _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
{
}
@@ -52,7 +52,7 @@
if(_cl_tuner != nullptr)
{
// Tune the OpenCL kernel
- _cl_tuner->tune_kernel(kernel);
+ _cl_tuner->tune_kernel_dynamic(kernel);
}
// Run kernel
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 5f58024..d0e7d76 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -29,6 +29,11 @@
using namespace arm_compute;
+CLSubTensor::CLSubTensor()
+ : _parent(nullptr), _info()
+{
+}
+
CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
: _parent(nullptr), _info()
{
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index bc513d1..dd27738 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@
return _allocator.cl_data();
}
-ITensorAllocator *CLTensor::allocator()
+CLTensorAllocator *CLTensor::allocator()
{
return &_allocator;
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index ad165fa..54e7c5b 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,36 +30,57 @@
using namespace arm_compute;
-CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
- : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner)
+namespace
{
-}
+std::shared_ptr<arm_compute::ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+{
+ // Try fine-grain SVM
+ std::shared_ptr<ICLMemoryRegion> region = std::make_shared<CLFineSVMMemoryRegion>(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
-CLTensorAllocator::~CLTensorAllocator()
+ // Try coarse-grain SVM in case of failure
+ if(region != nullptr && region->ptr() == nullptr)
+ {
+ region = std::make_shared<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+ }
+ // Try legacy buffer memory in case of failure
+ if(region != nullptr && region->ptr() == nullptr)
+ {
+ region = std::make_shared<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+ }
+ return region;
+}
+} // namespace
+
+CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
+ : _associated_memory_group(nullptr), _memory(), _owner(owner)
{
- _buffer = cl::Buffer();
}
uint8_t *CLTensorAllocator::data()
{
- return _mapping;
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
const cl::Buffer &CLTensorAllocator::cl_data() const
{
- return _buffer;
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return _memory.region()->cl_data();
}
void CLTensorAllocator::allocate()
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+
if(_associated_memory_group == nullptr)
{
- _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+ _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
}
else
{
- _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer()), info().total_size());
+ _associated_memory_group->finalize_memory(_owner, _memory.region()->handle(), info().total_size());
+ _memory.region()->set_size(info().total_size());
}
info().set_is_resizable(false);
}
@@ -68,41 +89,55 @@
{
if(_associated_memory_group == nullptr)
{
- _buffer = cl::Buffer();
+ _memory = CLMemory();
info().set_is_resizable(true);
}
}
+arm_compute::Status CLTensorAllocator::import_memory(CLMemory memory)
+{
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->cl_data().get() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+ _memory = memory;
+ info().set_is_resizable(false);
+
+ return Status{};
+}
+
void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
{
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
- ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+ _memory = CLMemory(std::make_shared<CLBufferMemoryRegion>(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0));
_associated_memory_group = associated_memory_group;
}
uint8_t *CLTensorAllocator::lock()
{
- ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
- _mapping = map(CLScheduler::get().queue(), true);
- return _mapping;
+ return map(CLScheduler::get().queue(), true);
}
void CLTensorAllocator::unlock()
{
- ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
- unmap(CLScheduler::get().queue(), _mapping);
- _mapping = nullptr;
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ unmap(CLScheduler::get().queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
}
uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
- return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size()));
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+ _memory.region()->map(q, blocking);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
{
- ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
- q.enqueueUnmapMemObject(_buffer, mapping);
+ ARM_COMPUTE_UNUSED(mapping);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() == nullptr);
+ _memory.region()->unmap(q);
}
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index df8e255..5f82cd3 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -35,61 +35,6 @@
using namespace arm_compute;
-namespace
-{
-/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
-class Interceptor
-{
-public:
- explicit Interceptor(CLTuner &tuner);
-
- /** clEnqueueNDRangeKernel interface
- *
- * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
- * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
- * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
- * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
- * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
- * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
- * @param[in] num_events_in_wait_list Number of events in the waiting list
- * @param[in] event_wait_list Event waiting list
- * @param[in] event OpenCL kernel event
- *
- * @return the OpenCL status
- */
- cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event);
-
-private:
- CLTuner &_tuner;
-};
-
-Interceptor::Interceptor(CLTuner &tuner)
- : _tuner(tuner)
-{
-}
-
-cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event)
-{
- ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
- ARM_COMPUTE_UNUSED(event);
- if(_tuner.kernel_event_is_set())
- {
- // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
- return CL_SUCCESS;
- }
- cl_event tmp;
- cl_int retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
-
- // Set OpenCL event
- _tuner.set_cl_kernel_event(tmp);
-
- return retval;
-}
-
-} // namespace
-
CLTuner::CLTuner(bool tune_new_kernels)
: real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
{
@@ -113,7 +58,12 @@
return _tune_new_kernels;
}
-void CLTuner::tune_kernel(ICLKernel &kernel)
+void CLTuner::tune_kernel_static(ICLKernel &kernel)
+{
+ ARM_COMPUTE_UNUSED(kernel);
+}
+
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
{
// Get the configuration ID from the kernel
const std::string &config_id = kernel.config_id();
@@ -173,7 +123,25 @@
}
}
// Start intercepting enqueues:
- CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
+ auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list, cl_event * event)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+ ARM_COMPUTE_UNUSED(event);
+ if(this->kernel_event_is_set())
+ {
+ // If the event is already set, the kernel enqueue is sliced; since we only time the first slice, we skip the remaining enqueues.
+ return CL_SUCCESS;
+ }
+ cl_event tmp;
+ cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+ // Set OpenCL event
+ this->set_cl_kernel_event(tmp);
+
+ return retval;
+ };
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
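The dedicated Interceptor class is replaced by a capturing lambda assigned to the std::function-typed clEnqueueNDRangeKernel_ptr. The pattern in isolation, with hypothetical names:

    #include <functional>
    #include <iostream>

    struct Tuner
    {
        std::function<int(int)> enqueue_hook; // stands in for clEnqueueNDRangeKernel_ptr

        int record(int x)
        {
            std::cout << "intercepted " << x << '\n';
            return x;
        }

        void install()
        {
            // Capturing `this` lets the stored callable reach back into the tuner,
            // just as the lambda above calls this->set_cl_kernel_event().
            enqueue_hook = [this](int x) { return this->record(x); };
        }
    };

    int main()
    {
        Tuner tuner;
        tuner.install();
        return tuner.enqueue_hook(42) == 42 ? 0 : 1;
    }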
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
new file mode 100644
index 0000000..ff50073
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>();
+ k->configure(input, output, num_groups);
+ _kernel = std::move(k);
+}
+
+Status CLChannelShuffleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+ return CLChannelShuffleLayerKernel::validate(input, output, num_groups);
+}
+} // namespace arm_compute
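A configuration sketch for the new function; shapes are illustrative, and num_groups must divide the channel count:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(14U, 14U, 8U), 1, DataType::F32));

        CLChannelShuffleLayer shuffle;
        shuffle.configure(&src, &dst, 2); // 8 channels shuffled across 2 groups

        src.allocator()->allocate();
        dst.allocator()->allocate();
        shuffle.run();
        return 0;
    }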
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..c226e56
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>();
+ k->configure(input, output, original_input_shape, data_layout);
+ _kernel = std::move(k);
+}
+
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ return CLConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
\ No newline at end of file
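A validation sketch for the new function, assuming fully connected weights laid out as a 2D tensor of outputs by flattened inputs; here hypothetical weights trained against a 4x4x2 NCHW feature map are checked for an NHWC re-layout:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"

    using namespace arm_compute;

    int main()
    {
        // 8 outputs by 4 * 4 * 2 = 32 flattened inputs.
        const TensorInfo weights(TensorShape(8U, 32U), 1, DataType::F32);
        const TensorInfo converted(TensorShape(8U, 32U), 1, DataType::F32);

        const Status status = CLConvertFullyConnectedWeights::validate(
            &weights, &converted, TensorShape(4U, 4U, 2U), DataLayout::NHWC);
        return bool(status) ? 0 : 1;
    }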
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 1a486ce..47a8d5f 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -42,25 +42,34 @@
{
}
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+ enable_fast_math));
- switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
- weights_info, CLScheduler::get().target()))
+ switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+ weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
{
+ case ConvolutionMethod::WINOGRAD:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ _function = std::move(f);
+ break;
+ }
case ConvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
- f->configure(input, weights, biases, output, conv_info);
+ f->configure(input, weights, biases, output, conv_info, act_info);
_function = std::move(f);
break;
}
case ConvolutionMethod::GEMM:
{
auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info);
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
_function = std::move(f);
break;
}
@@ -71,25 +80,30 @@
}
Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- //Configure if the parameters match the direct convolution or the gemm-based
const GPUTarget gpu_target = CLScheduler::get().target();
- switch(CLConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, gpu_target))
+ switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
{
+ case ConvolutionMethod::WINOGRAD:
+ {
+ // Validate Winograd
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+ break;
+ }
case ConvolutionMethod::DIRECT:
{
// Validate direct convolution layer
- CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
break;
}
case ConvolutionMethod::GEMM:
{
// Validate gemm-based convolution layer
- CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
break;
}
default:
@@ -100,21 +114,34 @@
return Status{};
}
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const GPUTarget gpu_target)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
{
- ARM_COMPUTE_UNUSED(input);
- ARM_COMPUTE_UNUSED(weights);
- ARM_COMPUTE_UNUSED(biases);
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
ARM_COMPUTE_UNUSED(weights_info);
ARM_COMPUTE_UNUSED(gpu_target);
- return ConvolutionMethod::GEMM;
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+ if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ }
}
void CLConvolutionLayer::run()
{
+ prepare();
_function->run();
}
+
+void CLConvolutionLayer::prepare()
+{
+ _function->prepare();
+}
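The heuristic above can be queried directly: any dilation other than 1x1, or fewer than 16 input channels, forces GEMM; otherwise WINOGRAD is returned whenever CLWinogradConvolutionLayer::validate() accepts the configuration. A sketch with illustrative NCHW shapes:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        const TensorInfo src(TensorShape(56U, 56U, 32U), 1, DataType::F32);
        const TensorInfo weights(TensorShape(3U, 3U, 32U, 64U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(56U, 56U, 64U), 1, DataType::F32);
        const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1: 3x3 same convolution

        // 32 input channels and no dilation: Winograd is tried before falling back to GEMM.
        const ConvolutionMethod method = CLConvolutionLayer::get_convolution_method(
            &src, &weights, &dst, conv_info, WeightsInfo(), ActivationLayerInfo(),
            CLScheduler::get().target(), Size2D(1U, 1U), /* enable_fast_math */ false);
        return method == ConvolutionMethod::GEMM ? 1 : 0;
    }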
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
new file mode 100644
index 0000000..3442e37
--- /dev/null
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLCopy::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9e6c0b4..cb8dc02 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -80,7 +80,7 @@
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, info, WeightsInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
return Status{};
}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 88e9376..676a121 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -24,6 +24,8 @@
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
@@ -35,17 +37,27 @@
using namespace arm_compute::misc::shape_calculator;
CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
- : _kernel(), _border_handler()
+ : _kernel(nullptr), _border_handler()
{
}
-void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- _kernel.set_target(CLScheduler::get().target());
- _kernel.configure(input, weights, biases, output, conv_info);
+ if(input->info()->data_layout() == DataLayout::NCHW)
+ {
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
+ }
+ else
+ {
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ }
+
+ _kernel->set_target(CLScheduler::get().target());
+ _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
// Configure border handler
PixelValue &&zero_value(0.f);
@@ -53,42 +65,62 @@
{
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
}
- _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+}
+
+Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info, GPUTarget gpu_target)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+ if(input->data_layout() == DataLayout::NCHW)
+ {
+ return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+ }
+
+ return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
}
void CLDepthwiseConvolutionLayer3x3::run()
{
CLScheduler::get().enqueue(_border_handler);
- CLScheduler::get().enqueue(_kernel);
+ CLScheduler::get().enqueue(*_kernel);
}
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
{
}
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
const size_t weights_w = weights->info()->dimension(0);
const size_t weights_h = weights->info()->dimension(1);
const size_t weights_z = weights->info()->dimension(2);
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_first_run = true;
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
bool append_bias = (biases != nullptr) && !_is_quantized;
const GPUTarget gpu_target = CLScheduler::get().target();
// Calculate output shape
- TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
// Output width and height
- const unsigned int conv_w = dwc_output_shape.x();
- const unsigned int conv_h = dwc_output_shape.y();
+ const unsigned int conv_w = output_shape.x();
+ const unsigned int conv_h = output_shape.y();
// Set up intermediate tensors
const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
@@ -101,7 +133,7 @@
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -117,7 +149,7 @@
_v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
_v2mm_kernel.set_target(gpu_target);
_v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
_vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
// Output staged configuration
@@ -152,18 +184,72 @@
_v2mm_output.allocator()->allocate();
}
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != weights->dimension(2));
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool append_bias = (biases != nullptr) && !is_quantized;
+ const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const size_t weights_w = weights->dimension(0);
+ const size_t weights_h = weights->dimension(1);
+ const size_t weights_z = weights->dimension(2);
+ const unsigned int conv_w = output_shape.x();
+ const unsigned int conv_h = output_shape.y();
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+ DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+ TensorShape shape_v2mm_out = input->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+ TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+ }
+
+ return Status{};
+}
+
void CLDepthwiseConvolutionLayer::run()
{
+ // Run weights reshaping (runs once per configure call)
+ if(_is_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ _is_first_run = false;
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
+ }
+
CLScheduler::get().enqueue(_im2col_kernel);
-
- CLScheduler::get().enqueue(_weights_reshape_kernel);
-
CLScheduler::get().enqueue(_v2mm_input_fill_border);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
CLScheduler::get().enqueue(_v2mm_kernel);
-
CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-
if(_is_quantized)
{
CLScheduler::get().enqueue(_output_stage_kernel);
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 5559d42..6f33b2e 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
+#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
@@ -33,8 +34,18 @@
{
}
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
+
+ return Status{};
+}
+
void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
_dequantize_kernel.configure(input, output, min_max);
}
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index d6a335c..c451bd4 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,11 +33,11 @@
using namespace arm_compute;
CLDirectConvolutionLayer::CLDirectConvolutionLayer()
- : _direct_conv_kernel(), _input_border_handler()
+ : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
{
}
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
// Set GPU target
_direct_conv_kernel.set_target(CLScheduler::get().target());
@@ -52,11 +52,28 @@
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
}
_input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
+
+ _is_activationlayer_enabled = act_info.enabled();
+
+ // Configure Activation Layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
- return CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target()));
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+ return Status{};
}
void CLDirectConvolutionLayer::run()
@@ -66,4 +83,10 @@
// Run direct convolution
CLScheduler::get().enqueue(_direct_conv_kernel);
+
+ // Run Activation Layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
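Fused activation follows the same pattern across the convolution functions: validate() only checks the appended CLActivationLayer when act_info.enabled() is true. A validation sketch with a bounded ReLU; shapes are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        const TensorInfo src(TensorShape(28U, 28U, 16U), 1, DataType::F32);
        const TensorInfo weights(TensorShape(3U, 3U, 16U, 16U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(28U, 28U, 16U), 1, DataType::F32);

        const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);
        const Status status = CLDirectConvolutionLayer::validate(
            &src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 1, 1), act);
        return bool(status) ? 0 : 1;
    }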
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 2b4670b..151fa1b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -37,10 +37,8 @@
namespace
{
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, bool is_interleaved_transposed)
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
- const GPUTarget gpu_target = CLScheduler::get().target();
-
if(is_data_type_quantized_asymmetric(input.data_type()))
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -55,7 +53,7 @@
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
}
return Status{};
@@ -75,12 +73,12 @@
}
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
- _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
+ : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+ _im2col_output(), _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _original_weights(nullptr)
{
}
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
if(_is_quantized)
{
@@ -102,8 +100,7 @@
else
{
// Configure matrix multiply kernel
- _mm_kernel.set_target(CLScheduler::get().target());
- _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
}
}
@@ -114,7 +111,7 @@
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
- TensorShape shape_im2col = compute_im2col_shape(input->info());
+ TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
// Configure im2col kernel
@@ -122,7 +119,7 @@
_im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
- configure_mm(&_im2col_output, weights, output, false);
+ configure_mm(&_im2col_output, weights, output);
// Allocate the output tensor for im2col once all the configure methods have been called
_im2col_output.allocator()->allocate();
@@ -133,7 +130,7 @@
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
// Configure matrix multiply kernel
- configure_mm(input, weights, output, false);
+ configure_mm(input, weights, output);
}
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
@@ -152,6 +149,7 @@
_is_fc_after_conv = true;
_accumulate_biases = false;
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _original_weights = weights;
// Configure gemmlowp output
if(_is_quantized)
@@ -222,13 +220,6 @@
_gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
_gemmlowp_output.allocator()->allocate();
}
-
- // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!_are_weights_reshaped)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
}
Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -243,7 +234,7 @@
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const GPUTarget gpu_target = CLScheduler::get().target();
- const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input)));
+ const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -300,7 +291,7 @@
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
}
// Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
// Validate output stage for asymmetric quantized types
if(is_quantized)
@@ -313,12 +304,7 @@
void CLFullyConnectedLayer::run()
{
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- _are_weights_reshaped = true;
- _reshape_weights_kernel.run();
- }
+ prepare();
_memory_group.acquire();
@@ -335,7 +321,7 @@
}
else
{
- CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+ _mm_gemm.run();
}
// Accumulate biases if provided
@@ -353,3 +339,30 @@
_memory_group.release();
}
+
+void CLFullyConnectedLayer::prepare()
+{
+ // Reshape of the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_kernel.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare GEMM and release the reshaped weights if no longer needed
+ if(!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ if(!_reshape_weights_output.is_used())
+ {
+ _reshape_weights_output.allocator()->free();
+ }
+ }
+
+ CLScheduler::get().queue().finish();
+ _are_weights_reshaped = true;
+ }
+}
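prepare() moves the one-shot work (weights reshape, GEMM weight retention and release) out of the steady-state path; run() invokes it lazily, so callers need not call it explicitly. A lifecycle sketch with illustrative shapes:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

        CLFullyConnectedLayer fc;
        fc.configure(&src, &weights, nullptr, &dst);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        dst.allocator()->allocate();

        fc.run(); // first run: prepare() reshapes the weights and marks the originals unused
        fc.run(); // subsequent runs skip the reshape
        return 0;
    }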
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 6b5cd2d..f81da6c 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -29,14 +29,18 @@
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
namespace
{
@@ -44,9 +48,10 @@
{
bool flag = true;
- if(gpu_target == GPUTarget::BIFROST)
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
{
- if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+ // COMPMID-852
+ if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
{
const float scale = k < 1024 ? 2.0f : 2.5f;
flag = (scale * n) > ((1.66f * n) + 38.4f);
@@ -56,39 +61,19 @@
flag = false;
}
}
-
- return flag;
-}
-
-Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- if(c != nullptr)
+ else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+ // Reshape the matrices only if this is not a vector-by-matrix case and matrix B is reshaped only on the first run
+ flag = m != 1 && reshape_b_only_on_first_run;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(beta);
- return Status{};
+ return flag;
}
} // namespace
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
- _is_first_run(true), _reshape_b_only_on_first_run(false)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+ _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
@@ -97,10 +82,14 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+ // Store original b matrix
+ _original_b = b;
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = false;
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
@@ -121,7 +110,7 @@
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
- if(gpu_target == GPUTarget::BIFROST)
+ if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
mult_transpose1xW_width = 4;
mult_interleave4x4_height = 2;
@@ -137,8 +126,10 @@
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
-
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
@@ -154,7 +145,10 @@
{
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
}
// Configure matrix addition kernel
@@ -165,14 +159,74 @@
}
}
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_UNUSED(alpha);
+
+ // Check if we need to reshape the matrix B only on the first run
+ const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo tmp_output_info = *output->clone();
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->dimension(1);
+ const int n = b->dimension(0);
+ const int k = a->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
+
+ // Check if we need to reshape the matrix A and matrix B
+ const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+
+ if(run_interleave_transpose)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height));
+
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+ }
+
+ // Validate matrix multiply
+ auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target));
+
+ if(beta != 0 && c != nullptr)
+ {
+ // Validate matrix addition kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta));
+ }
+
return Status{};
}
void CLGEMM::run()
{
+ prepare();
+
_memory_group.acquire();
if(_is_interleaved_transposed)
@@ -180,14 +234,7 @@
// Run interleave kernel
CLScheduler::get().enqueue(_interleave_kernel, false);
- if(_is_first_run)
- {
- // Run transpose kernel
- CLScheduler::get().enqueue(_transpose_kernel, false);
-
- _is_first_run = false;
- }
- else if(!_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -205,3 +252,19 @@
_memory_group.release();
}
+
+void CLGEMM::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ _tmp_b.allocator()->allocate();
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+ _original_b->mark_as_unused();
+ }
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
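
Editor's note: the Bifrost branch of the reshape heuristic above reduces to a simple threshold on n. The standalone sketch below mirrors only the float branch shown in this hunk, assuming reshape_b_only_on_first_run is set; reshape_pays_off is our name, not the library's.

#include <iostream>

// Mirrors the float branch of is_interleaved_transposed() for the listed
// Bifrost targets (sketch only; the non-Bifrost branch is omitted)
bool reshape_pays_off(int m, int n, int k)
{
    if(k > 256 && m > 4)
    {
        const float scale = k < 1024 ? 2.0f : 2.5f;
        return (scale * n) > ((1.66f * n) + 38.4f);
    }
    return false;
}

int main()
{
    // scale = 2.0: 2n > 1.66n + 38.4  =>  n > ~112.9
    std::cout << reshape_pays_off(8, 112, 512) << ' ' << reshape_pays_off(8, 113, 512) << '\n'; // 0 1
    // scale = 2.5: 2.5n > 1.66n + 38.4  =>  n > ~45.7
    std::cout << reshape_pays_off(8, 45, 2048) << ' ' << reshape_pays_off(8, 46, 2048) << '\n'; // 0 1
}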
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c58af36..79495e4 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -38,8 +38,8 @@
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped()
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel()
{
}
@@ -86,16 +86,12 @@
void CLConvolutionLayerReshapeWeights::run()
{
- _memory_group.acquire();
-
CLScheduler::get().enqueue(_weights_reshape_kernel);
-
- _memory_group.release();
}
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _im2col_output(),
- _interleave_output(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+ _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
@@ -155,7 +151,8 @@
return Status{};
}
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -164,9 +161,13 @@
biases != nullptr ? biases->info() : nullptr,
output->info(),
conv_info,
- weights_info));
+ weights_info,
+ dilation,
+ act_info));
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_prepared = false;
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
const DataType dt = input->info()->data_type();
@@ -191,7 +192,7 @@
const unsigned int kernel_width = weights->info()->dimension(0);
const unsigned int kernel_height = weights->info()->dimension(1);
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
- conv_info);
+ conv_info, dilation);
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
@@ -226,7 +227,7 @@
_memory_group.manage(&_gemm_output);
// Configure im2col
- _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
// Configure GEMM
configure_mm(&_im2col_output, weights, &_gemm_output);
@@ -255,14 +256,19 @@
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- // Allocate intermediate tensor
- _weights_reshaped.allocator()->allocate();
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
ARM_COMPUTE_UNUSED(weights_info);
}
Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
@@ -272,6 +278,11 @@
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
+ }
+
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool append_bias = (biases != nullptr) && (!is_quantized);
const unsigned bias_element = (append_bias) ? 1 : 0;
@@ -284,12 +295,12 @@
const unsigned int kernel_width = weights->dimension(0);
const unsigned int kernel_height = weights->dimension(1);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info, dilation);
unsigned int mat_weights_cols = weights->dimension(3);
unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
- CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
// Create tensor info for im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
@@ -300,7 +311,7 @@
shape_im2col.set(2, 1);
TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->fixed_point_position());
im2col_reshaped_info.set_quantization_info(input->quantization_info());
- CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
// Create GEMM output tensor
TensorShape shape_gemm = im2col_reshaped_info.tensor_shape();
@@ -311,9 +322,10 @@
TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->fixed_point_position());
info_gemm.set_quantization_info(output->quantization_info());
- validate_mm(&im2col_reshaped_info, weights, &info_gemm);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(&im2col_reshaped_info, weights, &info_gemm));
+ TensorInfo tmp_info(shape_gemm, 1, DataType::QASYMM8, input->fixed_point_position());
+ tmp_info.set_quantization_info(output->quantization_info());
- TensorInfo tmp_info(input->tensor_shape(), 1, DataType::QASYMM8, input->fixed_point_position());
if(is_quantized)
{
float multiplier = input->quantization_info().scale * weights->quantization_info().scale / output->quantization_info().scale;
@@ -324,7 +336,7 @@
}
// Validate Col2Im
- CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h)));
if(biases != nullptr)
{
@@ -341,18 +353,18 @@
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+
return Status{};
}
void CLGEMMConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- _reshape_weights.run();
-
- _is_first_run = false;
- }
+ prepare();
_memory_group.acquire();
@@ -377,5 +389,36 @@
// Reshape output matrix
CLScheduler::get().enqueue(_col2im_kernel, false);
+ // Run Activation Layer if enabled
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
+
_memory_group.release();
}
+
+void CLGEMMConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Run weights reshaping and mark as unused
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Run GEMM prepare
+ if(!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ if(!_weights_reshaped.is_used())
+ {
+ _weights_reshaped.allocator()->free();
+ }
+ }
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
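
Editor's note: the GEMM dimensions used by this function follow directly from the im2col transform. Below is a standalone sketch of the shape arithmetic, using the standard output-size formula with dilation and symmetric padding; conv_out_dim is our helper, and unlike the library's scaled_dimensions it ignores the configurable rounding mode.

#include <iostream>

// Standard convolution output size with dilation (floor rounding, symmetric padding)
unsigned int conv_out_dim(unsigned int in, unsigned int k, unsigned int pad, unsigned int stride, unsigned int dilation)
{
    const unsigned int dilated_k = (k - 1) * dilation + 1;
    return (in + 2 * pad - dilated_k) / stride + 1;
}

int main()
{
    const unsigned int in_w = 32, in_h = 32, channels = 16, num_kernels = 64;
    const unsigned int k = 3, pad = 1, stride = 1, dilation = 2;
    const bool         append_bias = true;

    const unsigned int conv_w = conv_out_dim(in_w, k, pad, stride, dilation); // 30
    const unsigned int conv_h = conv_out_dim(in_h, k, pad, stride, dilation); // 30

    // One im2col row per output pixel; one column per weight value (plus optional bias)
    const unsigned int mat_weights_rows = k * k * channels + (append_bias ? 1 : 0); // 145
    std::cout << "im2col:      " << conv_w * conv_h << " x " << mat_weights_rows << '\n'; // 900 x 145
    std::cout << "gemm output: " << conv_w * conv_h << " x " << num_kernels << '\n';      // 900 x 64
}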
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index c688299..711b006 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -41,7 +41,7 @@
{
bool flag = true;
- if(gpu_target == GPUTarget::BIFROST)
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
{
// COMPMID-852
if(k > 256 && m > 4 && reshape_b_only_on_first_run)
@@ -102,7 +102,10 @@
matrix_b = &_tmp_b;
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
// Configure interleave kernel
_mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
@@ -119,7 +122,10 @@
{
TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
- _memory_group.manage(&_vector_sum_col);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_vector_sum_col);
+ }
// Configure Matrix B reduction kernel
_mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
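
Editor's note on why _vector_sum_col can skip per-run memory management above: in the standard offset decomposition for quantized GEMM (the gemmlowp scheme, with zero points z_a, z_b and depth K), the column sums of B are the only correction term that depends on B, so once B is reshaped only on the first run its column sums are equally compute-once:

\sum_{k=1}^{K} (a_{ik} - z_a)(b_{kj} - z_b) = \sum_{k} a_{ik} b_{kj} - z_b \sum_{k} a_{ik} - z_a \sum_{k} b_{kj} + K z_a z_b

The term \sum_k b_{kj} is what _mtx_b_reduction_kernel writes into _vector_sum_col; the per-row sums of A must still be recomputed on every run.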
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 4b32954..ddce5fb 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,8 @@
}
CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
- : _border_handler(),
+ : _horizontal_border_handler(),
+ _vertical_border_handler(),
_horizontal_reduction(),
_vertical_reduction()
{
@@ -64,6 +65,9 @@
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
+ // Constant value to use for vertical fill border when the border mode is CONSTANT
+ const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
+
/* Get number of pyramid levels */
const size_t num_levels = pyramid->info()->num_levels();
@@ -72,28 +76,31 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+ _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _vertical_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
-
_tmp.init(pyramid_info);
for(size_t i = 0; i < num_levels - 1; ++i)
{
/* Configure horizontal kernel */
- _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+ _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
/* Configure vertical kernel */
- _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+ _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
/* Configure border */
- _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+
+ /* Configure border */
+ _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
}
_tmp.allocate();
}
@@ -110,13 +117,15 @@
_pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
_input->map(CLScheduler::get().queue(), true /* blocking */);
_pyramid->get_pyramid_level(0)->copy_from(*_input);
+
_input->unmap(CLScheduler::get().queue());
_pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- CLScheduler::get().enqueue(_border_handler[i], false);
+ CLScheduler::get().enqueue(_horizontal_border_handler[i], false);
CLScheduler::get().enqueue(_horizontal_reduction[i], false);
+ CLScheduler::get().enqueue(_vertical_border_handler[i], false);
CLScheduler::get().enqueue(_vertical_reduction[i], false);
}
}
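
Editor's note on pixel_value_u16 above: assuming the half-scale pyramid's separable 5-tap binomial kernel [1 4 6 4 1] (an assumption of this note; the kernel itself is not shown in this diff), a CONSTANT border value c becomes, after the horizontal pass into the U16 intermediate,

c \cdot (1 + 4 + 6 + 4 + 1) = 16c = 2c + 8c + 6c,

which is exactly the grouped form computed in the code, so the vertical pass sees a border consistent with running the horizontal kernel over constant input.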
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index d1bb65f..a3010a7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,6 +52,26 @@
_sumsq.allocator()->allocate();
}
+Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ TensorShape shape(input->tensor_shape());
+
+ // Create intermediate tensor info
+ TensorInfo sum_sq;
+ sum_sq.set_data_type(input->data_type());
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+ // Reduce the shape on the reduction axis (only axis 0 is supported)
+ shape.set(0, 1);
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+ return Status{};
+}
+
void CLL2NormalizeLayer::run()
{
_memory_group.acquire();
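
Editor's note: for reference, the computation the new validate() path mirrors, for the supported axis 0; reading the epsilon parameter as a lower clamp on the sum of squares (this note's assumption), it guards near-zero norms:

y_i = \frac{x_i}{\sqrt{\max\left(\sum_j x_j^2,\; \epsilon\right)}}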
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
new file mode 100644
index 0000000..930d311
--- /dev/null
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate1(), _gemm_input_gate2(), _transpose_input_gate1(), _transpose_input_gate2(), _accum_input_gate1(),
+ _accum_input_gate2(), _subtract_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate1(), _gemm_forget_gate2(), _transpose_forget_gate1(),
+ _transpose_forget_gate2(), _accum_forget_gate1(), _accum_forget_gate2(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state1(),
+ _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output1(),
+ _gemm_output2(), _transpose_output1(), _transpose_output2(), _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state(),
+ _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(),
+ _input_gate_out3(), _input_gate_out4(), _input_gate_out5(), _input_gate_out6(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+ _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _output6(),
+ _cell_state_activation(), _output_projection1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false),
+ _perform_projection_clipping(false)
+{
+}
+
+void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+ ICLTensor *output_state, ICLTensor *cell_state, ICLTensor *scratch_buffer, ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info,
+ float cell_threshold, float projection_threshold)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+ LSTMParams<ITensorInfo> lstm_params_info;
+ if(lstm_params.has_peephole_opt())
+ {
+ lstm_params_info.set_peephole_params(lstm_params.cell_to_input_weights()->info(), lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+ }
+ if(lstm_params.has_projection())
+ {
+ lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(), lstm_params.projection_bias()->info());
+ }
+ if(!lstm_params.has_cifg_opt())
+ {
+ lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
+ lstm_params.cell_to_input_weights()->info(), lstm_params.input_gate_bias()->info());
+ }
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
+ input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ output_state->info(), cell_state->info(), scratch_buffer->info(), output->info(), lstm_params_info,
+ activation_info, cell_threshold, projection_threshold));
+
+ const TensorShape cell_state_shape = cell_state->info()->tensor_shape();
+
+ TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ TensorShape forget_gate2_shape = compute_transposed_shape(*forget_gate_bias->info());
+ TensorShape forget_gate3_shape{ 1, output_state->info()->dimension(1) };
+ _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
+ _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _forget_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the forget gate
+ // forget_gate = Activation(input * input_to_forget_weights + output_state * recurrent_to_forget_weights + cell_state * cell_to_forget_weights + forget_gate_bias)
+ _memory_group.manage(&_forget_gate_out1);
+ _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1, true, false);
+ _memory_group.manage(&_forget_gate_out2);
+ _transpose_forget_gate1.configure(recurrent_to_forget_weights, &_forget_gate_out2);
+ _memory_group.manage(&_forget_gate_out3);
+ _gemm_forget_gate1.configure(output_state, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
+ _forget_gate_out2.allocator()->allocate();
+ _memory_group.manage(&_forget_gate_out6);
+ _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out6, ConvertPolicy::SATURATE);
+ CLTensor *forget_gate_out = &_forget_gate_out6;
+
+ if(lstm_params.has_peephole_opt())
+ {
+ _forget_gate_out4.allocator()->init(TensorInfo(forget_gate2_shape, 1, input->info()->data_type()));
+ _forget_gate_out5.allocator()->init(TensorInfo(forget_gate3_shape, 1, input->info()->data_type()));
+
+ _run_peephole_opt = true;
+ _memory_group.manage(&_forget_gate_out4);
+ _transpose_forget_gate2.configure(lstm_params.cell_to_forget_weights(), &_forget_gate_out4);
+ _memory_group.manage(&_forget_gate_out5);
+ _gemm_forget_gate2.configure(cell_state, &_forget_gate_out4, nullptr, &_forget_gate_out5, 1.f, 0.f);
+ _forget_gate_out4.allocator()->allocate();
+ _accum_forget_gate2.configure(&_forget_gate_out6, &_forget_gate_out5, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _forget_gate_out5.allocator()->allocate();
+ _forget_gate_out6.allocator()->allocate();
+ forget_gate_out = &_forget_gate_out3;
+ }
+ else
+ {
+ _forget_gate_out3.allocator()->allocate();
+ }
+ _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ forget_gate_out->allocator()->allocate();
+
+ TensorShape input_gate3_shape{ 1, output_state->info()->dimension(1) };
+ _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _input_gate_out5.allocator()->init(TensorInfo(input_gate3_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the input gate
+ // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + cell_state * cell_to_input_weights + input_gate_bias), without CIFG
+ // input_gate = 1 - forget_gate, with CIFG
+ if(lstm_params.has_cifg_opt())
+ {
+ _memory_group.manage(&_input_gate_out1);
+ _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _ones.allocator()->allocate();
+ _run_cifg_opt = true;
+ }
+ else
+ {
+ TensorShape input_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ TensorShape input_gate2_shape = compute_transposed_shape(*lstm_params.cell_to_input_weights()->info());
+
+ _input_gate_out2.allocator()->init(TensorInfo(input_gate1_shape, 1, input->info()->data_type()));
+ _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _input_gate_out4.allocator()->init(TensorInfo(input_gate2_shape, 1, input->info()->data_type()));
+ _input_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_input_gate_out1);
+ _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1, true, false);
+ _memory_group.manage(&_input_gate_out2);
+ _transpose_input_gate1.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+ _memory_group.manage(&_input_gate_out3);
+ _gemm_input_gate1.configure(output_state, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+ _input_gate_out2.allocator()->allocate();
+ _memory_group.manage(&_input_gate_out4);
+ _transpose_input_gate2.configure(lstm_params.cell_to_input_weights(), &_input_gate_out4);
+ _memory_group.manage(&_input_gate_out5);
+ _gemm_input_gate2.configure(cell_state, &_input_gate_out4, nullptr, &_input_gate_out5, 1.f, 0.f);
+ _input_gate_out4.allocator()->allocate();
+ _memory_group.manage(&_input_gate_out6);
+ _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out6, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
+ _accum_input_gate2.configure(&_input_gate_out6, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out5.allocator()->allocate();
+ _input_gate_out6.allocator()->allocate();
+ _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ }
+
+ TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
+ _cell_state_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the cell state
+ // cell_state = Clip((PixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
+ _memory_group.manage(&_cell_state_out1);
+ _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1, true, false);
+ _memory_group.manage(&_cell_state_out2);
+ _transpose_cell_state1.configure(recurrent_to_cell_weights, &_cell_state_out2);
+ _memory_group.manage(&_cell_state_out3);
+ _gemm_cell_state1.configure(output_state, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+ _cell_state_out2.allocator()->allocate();
+ _memory_group.manage(&_cell_state_out4);
+ _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+ _memory_group.manage(&_cell_state_out5);
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _input_gate_out1.allocator()->allocate();
+ _cell_state_out4.allocator()->allocate();
+ _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _forget_gate_out1.allocator()->allocate();
+ _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _cell_state_out3.allocator()->allocate();
+ _cell_state_out5.allocator()->allocate();
+
+ // Perform clipping
+ if(cell_threshold != 0.f)
+ {
+ _perform_cell_clipping = true;
+ _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+ }
+
+ TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+ TensorShape output2_shape = compute_transposed_shape(*cell_bias->info());
+ TensorShape output3_shape{ 1, output_state->info()->dimension(1) };
+ _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
+ _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _output6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the output
+ // output_gate = Activation(input * input_to_output_weights + output_state * recurrent_to_output_weights + cell_state * cell_to_output_weights + output_gate_bias)
+ _memory_group.manage(&_output1);
+ _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1, true, false);
+ _memory_group.manage(&_output2);
+ _transpose_output1.configure(recurrent_to_output_weights, &_output2);
+ _memory_group.manage(&_output3);
+ _gemm_output1.configure(output_state, &_output2, nullptr, &_output3, 1.f, 0.f);
+ _output2.allocator()->allocate();
+ _memory_group.manage(&_output6);
+ _accum_output1.configure(&_output1, &_output3, &_output6, ConvertPolicy::SATURATE);
+ _output3.allocator()->allocate();
+ CLTensor *output_gate_out = &_output6;
+ if(lstm_params.has_peephole_opt())
+ {
+ _output4.allocator()->init(TensorInfo(output2_shape, 1, input->info()->data_type()));
+ _output5.allocator()->init(TensorInfo(output3_shape, 1, input->info()->data_type()));
+
+ _memory_group.manage(&_output4);
+ _transpose_output2.configure(lstm_params.cell_to_output_weights(), &_output4);
+ _memory_group.manage(&_output5);
+ _gemm_output2.configure(&_cell_state_out1, &_output4, nullptr, &_output5, 1.f, 0.f);
+ _accum_output2.configure(&_output6, &_output5, &_output1, ConvertPolicy::SATURATE);
+ _output6.allocator()->allocate();
+ output_gate_out = &_output1;
+
+ // Allocate intermediate buffers
+ _output4.allocator()->allocate();
+ _output5.allocator()->allocate();
+ }
+ else
+ {
+ _output1.allocator()->allocate();
+ }
+ _activation_output.configure(output_gate_out, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ output_gate_out->allocator()->allocate();
+
+ _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+ // Configure block that calculates the output state
+ /** lstm_res = PixelwiseMul(output, Activation(cell_state))
+ *
+ * -- Clip(lstm_res * projection_weights + projection_bias, projection_threshold) , if there is a projection
+ * /
+ * output_state = --
+ * \
+ * -- lstm_res , otherwise
+ */
+ _memory_group.manage(&_cell_state_activation);
+ _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
+ _pixelwise_mul_output_state.configure(&_cell_state_activation, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _cell_state_activation.allocator()->allocate();
+
+ if(lstm_params.has_projection())
+ {
+ _has_projection_weights = true;
+ _output_projection1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_output_projection1);
+ _fully_connected_output_state.configure(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), &_output_projection1, true, false);
+ // Perform clipping
+ if(projection_threshold != 0.f)
+ {
+ _perform_projection_clipping = true;
+ _projection_clip.configure(&_output_projection1, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ }
+
+ // Allocate intermediate buffer
+ _output_projection1.allocator()->allocate();
+ }
+
+ // Copy cell state and output
+ _copy_cell_state.configure(&_cell_state_out1, cell_state);
+ _cell_state_out1.allocator()->allocate();
+ _copy_output.configure(output_state, output);
+
+ // Vector for holding the tensors to store in scratch buffer
+ std::vector<ICLTensor *> scratch_inputs;
+ if(lstm_params.has_cifg_opt())
+ {
+ scratch_inputs.emplace_back(&_input_gate_out1);
+ }
+ scratch_inputs.emplace_back(&_cell_state_out1);
+ scratch_inputs.emplace_back(forget_gate_out);
+ scratch_inputs.emplace_back(output_gate_out);
+ _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+}
+
+Status CLLSTMLayer::validate(const ITensorInfo *input, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state, const ITensorInfo *cell_state, const ITensorInfo *scratch_buffer, const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_state->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_state->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights(), lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() != 1);
+ }
+
+ TensorShape units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
+ TensorShape gemmv_shape{ 1, output_state->dimension(1) };
+ TensorShape num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
+ const TensorInfo units_out_transposed_info = TensorInfo(units_out_transposed_shape, 1, input->data_type());
+ const TensorInfo gemmv_shape_info = TensorInfo(gemmv_shape, 1, input->data_type());
+ const TensorInfo num_units_transposed_info = TensorInfo(num_units_transposed_shape, 1, input->data_type());
+
+ // Validate forget gate
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, cell_state, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state, &units_out_transposed_info, nullptr, cell_state, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Validate input gate
+ if(!lstm_params.has_cifg_opt())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.cell_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), cell_state, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+ }
+
+ // Validate cell state
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, cell_state, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, activation_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, cell_state, cell_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+
+ if(cell_threshold != 0.f)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, cell_state, true, false));
+ if(lstm_params.has_peephole_opt())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+ // Validate output state
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, activation_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ if(lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), cell_state, true, false));
+ if(projection_threshold != 0.f)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
+ }
+ }
+
+ std::vector<TensorInfo> inputs_vector_info;
+ if(lstm_params.has_cifg_opt())
+ {
+ inputs_vector_info.emplace_back(*cell_state);
+ }
+ inputs_vector_info.emplace_back(*cell_state);
+ inputs_vector_info.emplace_back(*cell_state);
+ inputs_vector_info.emplace_back(*cell_state);
+
+ std::vector<ITensorInfo *> inputs_vector_info_raw;
+ for(auto &input : inputs_vector_info)
+ {
+ inputs_vector_info_raw.emplace_back(&input);
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+ return Status{};
+}
+
+void CLLSTMLayer::run()
+{
+ _memory_group.acquire();
+
+ _fully_connected_forget_gate.run();
+ CLScheduler::get().enqueue(_transpose_forget_gate1);
+ _gemm_forget_gate1.run();
+ CLScheduler::get().enqueue(_accum_forget_gate1);
+
+ if(_run_peephole_opt)
+ {
+ CLScheduler::get().enqueue(_transpose_forget_gate2);
+ _gemm_forget_gate2.run();
+ _accum_forget_gate2.run();
+ }
+ CLScheduler::get().enqueue(_activation_forget_gate);
+
+ if(_run_cifg_opt)
+ {
+ _ones.map(true);
+ std::fill_n(_ones.buffer(), _ones.info()->total_size(), 1);
+ _ones.unmap();
+ CLScheduler::get().enqueue(_subtract_input_gate);
+ }
+ else
+ {
+ _fully_connected_input_gate.run();
+ CLScheduler::get().enqueue(_transpose_input_gate1);
+ _gemm_input_gate1.run();
+ CLScheduler::get().enqueue(_transpose_input_gate2);
+ _gemm_input_gate2.run();
+ CLScheduler::get().enqueue(_accum_input_gate1);
+ _accum_input_gate2.run();
+ CLScheduler::get().enqueue(_activation_input_gate);
+ }
+
+ _fully_connected_cell_state.run();
+ CLScheduler::get().enqueue(_transpose_cell_state1);
+ _gemm_cell_state1.run();
+ CLScheduler::get().enqueue(_accum_cell_state1);
+ CLScheduler::get().enqueue(_activation_cell_state);
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
+ CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
+ CLScheduler::get().enqueue(_accum_cell_state2);
+
+ if(_perform_cell_clipping)
+ {
+ CLScheduler::get().enqueue(_cell_clip);
+ }
+
+ _fully_connected_output.run();
+ CLScheduler::get().enqueue(_transpose_output1);
+ _gemm_output1.run();
+ CLScheduler::get().enqueue(_accum_output1);
+ CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+ if(_run_peephole_opt)
+ {
+ CLScheduler::get().enqueue(_transpose_output2);
+ _gemm_output2.run();
+ _accum_output2.run();
+ }
+ CLScheduler::get().enqueue(_activation_output);
+
+ CLScheduler::get().enqueue(_activation_output_state);
+ CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+ if(_has_projection_weights)
+ {
+ _fully_connected_output_state.run();
+ if(_perform_projection_clipping)
+ {
+ CLScheduler::get().enqueue(_projection_clip);
+ }
+ }
+
+ CLScheduler::get().enqueue(_copy_cell_state);
+ CLScheduler::get().enqueue(_copy_output);
+
+ _concat_scratch_buffer.run();
+
+ _memory_group.release();
+}
\ No newline at end of file
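
Editor's note: consolidating the per-gate comments in this new file into standard notation (\sigma the logistic function, g the supplied activation, \circ elementwise multiplication; peephole, CIFG, clipping and projection terms apply only when the corresponding options are enabled):

f_t = \sigma(W_f x_t + U_f h_{t-1} + w_{cf} \circ c_{t-1} + b_f)
i_t = \sigma(W_i x_t + U_i h_{t-1} + w_{ci} \circ c_{t-1} + b_i)  \quad (\text{with CIFG: } i_t = 1 - f_t)
c_t = \mathrm{clip}\big(i_t \circ g(W_c x_t + U_c h_{t-1} + b_c) + f_t \circ c_{t-1},\; t_{cell}\big)
o_t = \sigma(W_o x_t + U_o h_{t-1} + w_{co} \circ c_t + b_o)
h_t = o_t \circ g(c_t), \quad \text{then with projection: } h_t \leftarrow \mathrm{clip}(W_{proj} h_t + b_{proj},\; t_{proj})

The scratch buffer concatenates the gate buffers along the width, which is why validate() accepts a width of either 3 or 4 times the number of cell units.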
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 9120aad..986fe00 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,72 +33,120 @@
using namespace arm_compute;
-CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false)
+namespace
{
-}
-
-void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+ ARM_COMPUTE_UNUSED(output);
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
- }
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
- bool _has_bias = (biases != nullptr);
- _is_first_run = true;
-
- // Get parameters for conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- unsigned int pad_x = 0;
- unsigned int pad_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- std::tie(pad_x, pad_y) = conv_info.pad();
+ bool has_bias = (biases != nullptr);
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
conv_info);
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+ const size_t mat_weights_cols = weights->dimension(3);
+ const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->dimension(4);
- // Create tensor to store the reshaped weights
- const size_t mat_weights_cols = weights->info()->dimension(3);
- const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
- const size_t mat_weights_num = weights->info()->dimension(4);
+ shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
- const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
-
- _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
- // Create tensor to store im2col reshaped inputs
const size_t mat_input_cols = mat_weights_rows;
const size_t mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
+
+ shape_im2col = input->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
- // Create locally connected layer output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm = shape_im2col;
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+ _is_first_run(false), _original_weights(nullptr)
+{
+}
+
+Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+ bool has_bias = (biases != nullptr);
+
+ if(has_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+ }
+
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+ TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+ TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+ TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, std::make_pair(conv_w, conv_h)));
+
+ return Status{};
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
+
+ bool _has_bias = (biases != nullptr);
+ _original_weights = weights;
+ _is_first_run = true;
+
+ const unsigned int kernel_width = weights->info()->dimension(0);
+ const unsigned int kernel_height = weights->info()->dimension(1);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
// Manage intermediate buffers
@@ -106,7 +154,7 @@
_memory_group.manage(&_gemm_output);
// Configure kernels
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -122,8 +170,13 @@
// Run weights reshaping (Runs once for every configure)
if(_is_first_run)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_is_first_run = false;
CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
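
Note: the new static validate() entry point mirrors the checks that configure() previously performed inline, so callers can reject invalid shapes before any kernels are configured or device memory is touched. A minimal usage sketch under assumed, illustrative shapes (32x32x3 F32 input, 3x3 kernels, 16 filters per spatial position; not part of the diff):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"

using namespace arm_compute;

void locally_connected_example()
{
    CLScheduler::get().default_init();

    CLTensor input, weights, output;
    // conv_w = conv_h = 30 for a 32x32 input, 3x3 kernel, stride 1, no padding,
    // so the weights need 30 * 30 = 900 distinct kernel banks in dimension 4
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U, 900U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(30U, 30U, 16U), 1, DataType::F32));

    const PadStrideInfo conv_info(1, 1, 0, 0);

    // Shape checking now happens without configuring any kernels
    const Status status = CLLocallyConnectedLayer::validate(input.info(), weights.info(), nullptr, output.info(), conv_info);
    if(bool(status))
    {
        CLLocallyConnectedLayer lc;
        lc.configure(&input, &weights, nullptr, &output, conv_info);
    }
}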
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index 146856c..55b7649 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -39,6 +39,6 @@
Status CLPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
{
- ARM_COMPUTE_RETURN_ERROR_ON(CLPermuteKernel::validate(input, output, perm));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm));
return Status{};
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 201bf87..17875a3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -41,13 +41,28 @@
_kernel = std::move(k);
// Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- PixelValue zero_value(0.f);
+ BorderMode border_mode{};
+ PixelValue pixel_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
{
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
- _border_handler.configure(input, _kernel->border_size(), border_mode, zero_value);
+ switch(input->info()->data_layout())
+ {
+ case DataLayout::NCHW:
+ border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ break;
+ case DataLayout::NHWC:
+ border_mode = BorderMode::CONSTANT;
+ if(PoolingType::MAX == pool_info.pool_type() && !is_data_type_quantized_asymmetric(input->info()->data_type()))
+ {
+ pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+ _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value);
}
Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
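
Note: for NHWC max pooling the border handler now fills the CONSTANT padding with std::numeric_limits<float>::lowest(), the identity element of max, so out-of-bounds samples can never win the comparison. A standalone sketch of that reasoning (not library code):

#include <algorithm>
#include <limits>
#include <vector>

// Max over a pooling window where missing (padded) samples take the fill value
float max_pool_window(const std::vector<float> &samples, int valid_count)
{
    const float fill = std::numeric_limits<float>::lowest();
    float result     = fill;
    for(int i = 0; i < static_cast<int>(samples.size()); ++i)
    {
        // Samples at indices >= valid_count model the CONSTANT border region
        const float v = (i < valid_count) ? samples[i] : fill;
        result        = std::max(result, v);
    }
    return result; // equals the max of the valid samples; the padding is neutral
}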
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index ed1f51c..a13859c 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
@@ -33,8 +34,21 @@
{
}
+Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ TensorInfo min_max{ input->num_channels(), input->data_type() };
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
+
+ return Status{};
+}
+
void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
// Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
_min_max_kernel.configure(input, &_min_max);
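
Note: the two kernels validated above implement min/max quantization in tandem: CLMinMaxLayerKernel computes the value range and CLQuantizationLayerKernel maps the F32 input onto U8. A host-side sketch of the mapping, assuming the affine formula q = 255 * (x - min) / (max - min) (an assumption for illustration, not the kernel source):

#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<uint8_t> quantize_min_max(const std::vector<float> &src)
{
    const auto  minmax = std::minmax_element(src.begin(), src.end());
    const float min_v  = *minmax.first;
    const float max_v  = *minmax.second;
    const float range  = std::max(max_v - min_v, 1e-7f); // guard against a constant input

    std::vector<uint8_t> dst;
    dst.reserve(src.size());
    for(float x : src)
    {
        dst.push_back(static_cast<uint8_t>(255.0f * (x - min_v) / range));
    }
    return dst;
}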
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
new file mode 100644
index 0000000..4843ba6
--- /dev/null
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+{
+}
+
+Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
+ const ITensorInfo *output, const ActivationLayerInfo &info)
+{
+ const int idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
+
+ auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info, true, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+ return Status{};
+}
+
+void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+ ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+
+ const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+ _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+ // Manage intermediate buffers and configure
+ _memory_group.manage(&_fully_connected_out);
+ _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out, true, false);
+
+ _memory_group.manage(&_gemm_output);
+ _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+ _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_add_output);
+
+ _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+
+ _fully_connected_out.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+
+ _activation_kernel.configure(&_add_output, hidden_state, info);
+ _add_output.allocator()->allocate();
+
+ _copy_kernel.configure(hidden_state, output);
+}
+
+void CLRNNLayer::run()
+{
+ _memory_group.acquire();
+ _fully_connected_kernel.run();
+ _gemm_state_f.run();
+ CLScheduler::get().enqueue(_add_kernel);
+ CLScheduler::get().enqueue(_activation_kernel);
+
+ // Copy the hidden state to the output
+ CLScheduler::get().enqueue(_copy_kernel);
+ _memory_group.release();
+}
\ No newline at end of file
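
Note: CLRNNLayer implements a single vanilla RNN step, hidden_state = act(weights * input + recurrent_weights * hidden_state + bias), composed from a fully connected layer, a GEMM, an addition and an activation, with the result copied to output. A hypothetical usage sketch with illustrative sizes:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"

using namespace arm_compute;

void rnn_step_example()
{
    CLScheduler::get().default_init();

    constexpr unsigned int input_size = 128, hidden = 64, batch = 4;

    CLTensor input, weights, recurrent_weights, bias, hidden_state, output;
    input.allocator()->init(TensorInfo(TensorShape(input_size, batch), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(input_size, hidden), 1, DataType::F32));
    recurrent_weights.allocator()->init(TensorInfo(TensorShape(hidden, hidden), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(hidden), 1, DataType::F32));
    hidden_state.allocator()->init(TensorInfo(TensorShape(hidden, batch), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(hidden, batch), 1, DataType::F32));

    CLRNNLayer           rnn;
    ActivationLayerInfo  act(ActivationLayerInfo::ActivationFunction::TANH);
    rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);

    for(auto *t : { &input, &weights, &recurrent_weights, &bias, &hidden_state, &output })
    {
        t->allocator()->allocate();
    }

    rnn.run(); // hidden_state and output now hold act(W*x + R*h + b)
}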
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index d02afb4..3a5133d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,19 +35,64 @@
using namespace arm_compute;
+namespace
+{
+unsigned int calculate_number_of_stages(const ITensorInfo *input)
+{
+ // Calculate number of WGs. 16 elements per thread, 8 threads per WG
+ const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
+
+ // Calculate the number of stages. The first stage performs the operation and the
+ // remaining stages sum the partial results; the last stage should use only 1 WG.
+ const unsigned int num_of_stages = num_of_wg / 128 + 2;
+
+ return num_of_stages;
+}
+} // namespace
+
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
{
}
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ const unsigned int num_of_stages = calculate_number_of_stages(input);
+
+ // Create temporary tensor infos
+ auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+
+ // Create intermediate tensor info
+ TensorShape shape{ input->tensor_shape() };
+
+ for(unsigned int i = 0; i < num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ sums_vector[i].set_data_type(input->data_type());
+ sums_vector[i].set_tensor_shape(shape);
+ sums_vector[i].set_num_channels(input->num_channels());
+ sums_vector[i].set_fixed_point_position(input->fixed_point_position());
+ }
+
+ // Validate ReductionOperation only on the first kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
+
+ // Validate ReductionOperation on intermediate stages
+ for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+ }
+
+ // Validate ReductionOperation on the last stage
+ const unsigned int last_stage = num_of_stages - 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
+
+ return Status{};
+}
+
void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
- // Calculate number of WGs. 16 elements per thread, 8 threads per WG
- unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
-
- // Calculate number of stages. First stage performs op and the rest reduction sum
- // depending on the size of the input. Last stage should have only 1 WG.
- _num_of_stages = num_of_wg / 128 + 2;
+ _num_of_stages = calculate_number_of_stages(input->info());
// Create temporary tensors
_sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
@@ -95,4 +140,4 @@
}
_memory_group.release();
-}
\ No newline at end of file
+}
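
Note: calculate_number_of_stages determines how many kernel launches the reduction needs. Each work-group covers 128 elements (16 per thread, 8 threads), and each later stage reduces up to 128 partial results per work-group. A standalone sketch with worked numbers:

#include <cmath>

unsigned int stages_for_width(unsigned int width)
{
    const unsigned int num_of_wg = static_cast<unsigned int>(std::ceil(width / 128.f));
    return num_of_wg / 128 + 2;
}

// stages_for_width(4096)   -> 32 WGs  -> 2 stages (operation + final single-WG stage)
// stages_for_width(100000) -> 782 WGs -> 8 stages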
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
new file mode 100644
index 0000000..d542781
--- /dev/null
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
+ : _concat_kernels_vector(),
+ _num_inputs(0)
+{
+}
+
+Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+ // Output auto initialization if not yet initialized
+ TensorInfo tmp_output_info = *output->clone();
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type(), inputs_vector[0]->fixed_point_position());
+
+ unsigned int width_offset = 0;
+ for(const auto &input : inputs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+ width_offset += input->dimension(0);
+ }
+
+ return Status{};
+}
+
+void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+{
+ _num_inputs = inputs_vector.size();
+
+ std::vector<ITensorInfo *> inputs_vector_info;
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+ }
+ TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+ ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
+
+ unsigned int width_offset = 0;
+
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+ width_offset += inputs_vector.at(i)->info()->dimension(0);
+ }
+}
+
+void CLWidthConcatenateLayer::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ }
+}
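
Note: a hypothetical usage sketch for the new function, concatenating two F32 tensors along the width (x) axis. The output tensor can be left uninitialized; configure() auto-initializes it to the combined 16x8 shape:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"

using namespace arm_compute;

void width_concat_example()
{
    CLScheduler::get().default_init();

    CLTensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(10U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(6U, 8U), 1, DataType::F32));

    CLWidthConcatenateLayer concat;
    concat.configure({ &a, &b }, &out); // out becomes 16x8 F32

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    concat.run();
}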
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..49753ad
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+ Size2D output_tile = Size2D{};
+
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 5U))
+ {
+ output_tile = Size2D(4U, 4U);
+ }
+
+ return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check whether the requested Winograd configuration requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd =
+ {
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} // namespace
+
+CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
+ _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+{
+}
+
+void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ input->info()->data_layout());
+
+ _is_prepared = false;
+ _original_weights = weights;
+
+ // Manage intermediate tensors
+ _memory_group.manage(&_input0);
+ _memory_group.manage(&_batched_mm_output);
+
+ // Do not manage _input1 as it contains the weights
+
+ // Configure input transform
+ _input_transform.configure(input, &_input0, winograd_info);
+
+ // Configure filter transform
+ _filter_transform.configure(weights, &_input1, winograd_info);
+
+ // Configure batched matrix multiply
+ _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+ // Configure output transform
+ _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
+
+ // Configure activation layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
+
+ // Allocate temporary tensors
+ _input0.allocator()->allocate();
+ _batched_mm_output.allocator()->allocate();
+}
+
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ input->data_layout());
+
+ // Validate input transform
+ const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+ const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, winograd_info));
+
+ // Validate filter transform
+ const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+ // Validate batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+
+ // Validate output transform
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+
+ return Status{};
+}
+
+void CLWinogradConvolutionLayer::run()
+{
+ prepare();
+
+ _memory_group.acquire();
+
+ // Run input transform
+ _input_transform.run();
+
+ // Run batched matrix multiplication
+ _batched_mm.run();
+
+ // Run output transform
+ CLScheduler::get().enqueue(_output_transform);
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
+
+ _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Run filter transform and mark original weights as unused
+ _input1.allocator()->allocate();
+ CLScheduler::get().enqueue(_filter_transform, false);
+ _original_weights->mark_as_unused();
+
+ // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+ _batched_mm.prepare();
+ if(!_input1.is_used())
+ {
+ _input1.allocator()->free();
+ }
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
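
Note: a hypothetical usage sketch for the new Winograd function with illustrative shapes. For a 3x3 F32 kernel on a 56x56 input the output tile is 4x4 and no fast math is needed; per check_support_fast_math above, a 5x5 kernel (4x4 tile) would require enable_fast_math = true:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

using namespace arm_compute;

void winograd_example()
{
    CLScheduler::get().default_init();

    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 64U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));

    CLWinogradConvolutionLayer conv;
    // Pad 1 on each side keeps the 56x56 spatial size with a 3x3 kernel, stride 1
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), ActivationLayerInfo(), false /* enable_fast_math */);
}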
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
new file mode 100644
index 0000000..09e8456
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
+ k->configure(input, output, winograd_info);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, winograd_info));
+ return Status{};
+}
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
new file mode 100644
index 0000000..c0ebd24
--- /dev/null
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernels.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace tuners
+{
+namespace
+{
+/** Tunes a @ref CLDirectConvolutionLayerKernel for a Bifrost target
+ *
+ * @param[in] k Kernel to tune
+ */
+void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k)
+{
+ cl::NDRange lws_hint = k.lws_hint();
+
+ const GPUTarget gpu_target = k.get_target();
+ const DataType dt = k._input->info()->data_type();
+ const TensorShape weights_shape = k._weights->info()->tensor_shape();
+ const TensorShape inputs_shape = k._input->info()->tensor_shape();
+ const size_t kernel_size = weights_shape.x();
+ const unsigned int stride_x = k._conv_stride_x;
+ const unsigned int stride_y = k._conv_stride_y;
+
+ if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32))
+ {
+ // Through extensive experimentation with over 30 representative tensor
+ // shapes, we found a small number of local work size configurations
+ // that result in nearly optimal execution times. Selecting the right
+ // lws for a given shape, however, required a complex decision tree,
+ // until we constructed a simple feature as described below.
+ //
+ // We started from the number of multiply-accumulate operations for a
+ // convolution layer, which is equal to the product of the input
+ // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
+ // this resulted in ties between distinct shapes that required distinct
+ // lws configurations. Replacing the width of the input with the kernel
+ // size, however, resulted in nearly optimal predictions. Variable names with
+ // a trailing underscore mark values that deviate from what the name suggests.
+ const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2];
+ const size_t product_of_input_dimensions_ = inputs_shape[0] * inputs_shape[1] * inputs_shape[2];
+ const float mega_ops_ = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ if(mega_ops_ < 1.f)
+ {
+ lws_hint = cl::NDRange(1, 1, 8);
+ }
+ else if(mega_ops_ < 7.f)
+ {
+ lws_hint = cl::NDRange(1, 1, 4);
+ }
+ else
+ {
+ lws_hint = cl::NDRange(1, 1, 2);
+ }
+ break;
+ }
+ case 3:
+ {
+ if(mega_ops_ < 1.f)
+ {
+ lws_hint = cl::NDRange(1, 1, 8);
+ }
+ else if(mega_ops_ < 13.f)
+ {
+ lws_hint = cl::NDRange(2, 1, 4);
+ }
+ else if(mega_ops_ < 50.f)
+ {
+ lws_hint = cl::NDRange(3, 1, 4);
+ }
+ else
+ {
+ lws_hint = cl::NDRange(2, 1, 6);
+ }
+ break;
+ }
+ case 5:
+ {
+ if(mega_ops_ < 2.f || mega_ops_ > 80.f)
+ {
+ lws_hint = cl::NDRange(2, 1, 4);
+ }
+ else
+ {
+ lws_hint = cl::NDRange(2, 1, 8);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ k.set_lws_hint(lws_hint);
+ }
+}
+} // namespace
+
+void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
+{
+ // Dispatch to the kernel-specific static tuner if the kernel type is supported
+ if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr)
+ {
+ tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel));
+ }
+}
+
+void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)
+{
+ ARM_COMPUTE_UNUSED(kernel);
+}
+} // namespace tuners
+} // namespace arm_compute
\ No newline at end of file
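
Note: a hypothetical wiring sketch showing how the tuner would be installed, assuming CLScheduler::default_init() accepts an ICLTuner pointer as in this release; static tuning then runs once per kernel configuration:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"

using namespace arm_compute;

void init_with_bifrost_tuner()
{
    // The tuner must outlive every kernel it tunes, hence static storage here
    static tuners::BifrostTuner tuner;
    CLScheduler::get().default_init(&tuner);
}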
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 168ed6e..92dce34 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
#include <condition_variable>
#include <iostream>
@@ -159,6 +160,7 @@
: _num_threads(num_threads_hint()),
_threads(_num_threads - 1)
{
+ get_cpu_configuration(_cpu_info);
}
void CPPScheduler::set_num_threads(unsigned int num_threads)
@@ -178,7 +180,7 @@
/** [Scheduler example] */
ThreadInfo info;
- info.cpu_info = _info;
+ info.cpu_info = &_cpu_info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index c8285b4..2adc14c 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@
{
ARM_COMPUTE_UNUSED(split_dimension);
ThreadInfo info;
- info.cpu_info = cpu_info();
+ info.cpu_info = &_cpu_info;
kernel->run(kernel->window(), info);
}
diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp
new file mode 100644
index 0000000..619b7e1
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPUpsample.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
+
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info, unsigned int inner_border_right, unsigned int inner_border_top)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernel>();
+ k->configure(input, output, info, inner_border_right, inner_border_top);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
new file mode 100644
index 0000000..7e8bf2b
--- /dev/null
+++ b/src/runtime/CPUUtils.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPUUtils.h"
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <fstream>
+#include <map>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef BARE_METAL
+#include <regex>
+#include <thread>
+#endif /* BARE_METAL */
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+#include <sys/auxv.h>
+
+/* Get HWCAP bits from asm/hwcap.h */
+#include <asm/hwcap.h>
+#endif /* !BARE_METAL */
+
+/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
+ * out of date (or for bare metal mode) */
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1 << 10)
+#endif /* HWCAP_ASIMDHP */
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif /* HWCAP_CPUID */
+
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP (1 << 20)
+#endif /* HWCAP_ASIMDDP */
+
+namespace
+{
+using namespace arm_compute;
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+struct PerCPUData
+{
+ CPUModel model = CPUModel::GENERIC;
+ unsigned int midr = 0;
+ bool model_set = false;
+};
+
+/* Convert an MIDR register value to a CPUModel enum value. */
+CPUModel midr_to_model(const unsigned int midr)
+{
+ CPUModel model;
+
+ // Unpack variant and CPU ID
+ const int variant = (midr >> 20) & 0xF;
+ const int cpunum = (midr >> 4) & 0xFFF;
+
+ // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
+ switch(cpunum)
+ {
+ case 0xd03:
+ model = CPUModel::A53;
+ break;
+
+ case 0xd05:
+ if(variant != 0)
+ {
+ model = CPUModel::A55r1;
+ }
+ else
+ {
+ model = CPUModel::A55r0;
+ }
+ break;
+
+ default:
+ model = CPUModel::GENERIC;
+ break;
+ }
+
+ return model;
+}
+
+void populate_models_cpuid(std::vector<PerCPUData> &cpusv)
+{
+ // If the CPUID capability is present, MIDR information is provided in /sys. Use that to populate the CPU model table.
+ uint32_t i = 0;
+ for(auto &c : cpusv)
+ {
+ std::stringstream str;
+ str << "/sys/devices/system/cpu/cpu" << i++ << "/regs/identification/midr_el1";
+ std::ifstream file;
+ file.open(str.str(), std::ios::in);
+ if(file.is_open())
+ {
+ std::string line;
+ if(bool(getline(file, line)))
+ {
+ const unsigned long midr = support::cpp11::stoul(line, nullptr, 16);
+ c.midr = (midr & 0xffffffff);
+ c.model = midr_to_model(c.midr);
+ c.model_set = true;
+ }
+ }
+ }
+}
+
+void populate_models_cpuinfo(std::vector<PerCPUData> &cpusv)
+{
+ // If "long-form" cpuinfo is present, parse that to populate models.
+ std::regex proc_regex("^processor.*(\\d+)$");
+ std::regex imp_regex("^CPU implementer.*0x(..)$");
+ std::regex var_regex("^CPU variant.*0x(.)$");
+ std::regex part_regex("^CPU part.*0x(...)$");
+ std::regex rev_regex("^CPU revision.*(\\d+)$");
+
+ std::ifstream file;
+ file.open("/proc/cpuinfo", std::ios::in);
+
+ if(file.is_open())
+ {
+ std::string line;
+ int midr = 0;
+ int curcpu = -1;
+
+ while(bool(getline(file, line)))
+ {
+ std::smatch match;
+
+ if(std::regex_match(line, match, proc_regex))
+ {
+ std::string id = match[1];
+ int newcpu = support::cpp11::stoi(id, nullptr, 0);
+
+ if(curcpu >= 0 && midr == 0)
+ {
+ // Matched a new CPU ID without any description of the previous one - looks like old format.
+ return;
+ }
+
+ if(curcpu >= 0)
+ {
+ cpusv[curcpu].midr = midr;
+ cpusv[curcpu].model = midr_to_model(midr);
+ cpusv[curcpu].model_set = true;
+ }
+
+ midr = 0;
+ curcpu = newcpu;
+
+ continue;
+ }
+
+ if(std::regex_match(line, match, imp_regex))
+ {
+ int impv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (impv << 24);
+ continue;
+ }
+
+ if(std::regex_match(line, match, var_regex))
+ {
+ int varv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (varv << 16);
+ continue;
+ }
+
+ if(std::regex_match(line, match, part_regex))
+ {
+ int partv = support::cpp11::stoi(match[1], nullptr, 16);
+ midr |= (partv << 4);
+ continue;
+ }
+
+ if(std::regex_match(line, match, rev_regex))
+ {
+ int regv = support::cpp11::stoi(match[1], nullptr, 10);
+ midr |= (regv);
+ midr |= (0xf << 16);
+ continue;
+ }
+ }
+
+ if(curcpu >= 0)
+ {
+ cpusv[curcpu].midr = midr;
+ cpusv[curcpu].model = midr_to_model(midr);
+ cpusv[curcpu].model_set = true;
+ }
+ }
+}
+
+int get_max_cpus()
+{
+ int max_cpus = 1;
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+ std::ifstream CPUspresent;
+ CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
+ bool success = false;
+
+ if(CPUspresent.is_open())
+ {
+ std::string line;
+
+ if(bool(getline(CPUspresent, line)))
+ {
+ /* The content of this file is a list of ranges or single values, e.g.
+ * 0-5, or 1-3,5,7 or similar. As we are interested in the
+ * max valid ID, we just need to find the last valid
+ * delimiter ('-' or ',') and parse the integer immediately after that.
+ */
+ auto startfrom = line.begin();
+
+ for(auto i = line.begin(); i < line.end(); ++i)
+ {
+ if(*i == '-' || *i == ',')
+ {
+ startfrom = i + 1;
+ }
+ }
+
+ line.erase(line.begin(), startfrom);
+
+ max_cpus = support::cpp11::stoi(line, nullptr, 0) + 1;
+ success = true;
+ }
+ }
+
+ // Return std::thread::hardware_concurrency() as a fallback.
+ if(!success)
+ {
+ max_cpus = std::thread::hardware_concurrency();
+ }
+#endif /* BARE_METAL */
+
+ return max_cpus;
+}
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+
+} // namespace
+
+namespace arm_compute
+{
+void get_cpu_configuration(CPUInfo &cpuinfo)
+{
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+ bool cpuid = false;
+ bool fp16_support = false;
+ bool dot_support = false;
+
+ const uint32_t hwcaps = getauxval(AT_HWCAP);
+
+ if((hwcaps & HWCAP_CPUID) != 0)
+ {
+ cpuid = true;
+ }
+
+ if((hwcaps & HWCAP_ASIMDHP) != 0)
+ {
+ fp16_support = true;
+ }
+
+ if((hwcaps & HWCAP_ASIMDDP) != 0)
+ {
+ dot_support = true;
+ }
+
+#ifdef __aarch64__
+ /* Pre-4.15 kernels don't have the ASIMDDP bit.
+ *
+ * Although the CPUID bit allows us to read the feature register
+ * directly, the kernel quite sensibly masks this to only show
+ * features known by it to be safe to show to userspace. As a
+ * result, pre-4.15 kernels won't show the relevant bit in the
+ * feature registers either.
+ *
+ * So for now, use a whitelist of CPUs known to support the feature.
+ */
+ if(!dot_support && cpuid)
+ {
+ /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */
+ const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
+ const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
+
+ unsigned long cpuid;
+
+ __asm __volatile(
+ "mrs %0, midr_el1\n"
+ : "=r"(cpuid)
+ :
+ : );
+
+ for(int i = 0; dotprod_whitelist_values[i] != 0; i++)
+ {
+ if((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i])
+ {
+ dot_support = true;
+ break;
+ }
+ }
+ }
+#endif /* __aarch64__ */
+ const unsigned int max_cpus = get_max_cpus();
+ cpuinfo.set_cpu_num(max_cpus);
+ cpuinfo.set_fp16(fp16_support);
+ cpuinfo.set_dotprod(dot_support);
+ std::vector<PerCPUData> percpu(max_cpus);
+ if(cpuid)
+ {
+ populate_models_cpuid(percpu);
+ }
+ else
+ {
+ populate_models_cpuinfo(percpu);
+ }
+ int j(0);
+ for(const auto &v : percpu)
+ {
+ cpuinfo.set_cpu_model(j++, v.model);
+ }
+#else /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+ ARM_COMPUTE_UNUSED(cpuinfo);
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+}
+
+unsigned int get_threads_hint()
+{
+ unsigned int num_threads_hint = 1;
+
+#ifndef BARE_METAL
+ std::map<std::string, unsigned int> cpu_part_occurrence_map;
+
+ // CPU part regex
+ std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
+ std::smatch cpu_part_match;
+
+ // Read cpuinfo and get occurrence of each core
+ std::ifstream cpuinfo;
+ cpuinfo.open("/proc/cpuinfo", std::ios::in);
+ if(cpuinfo.is_open())
+ {
+ std::string line;
+ while(bool(getline(cpuinfo, line)))
+ {
+ if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+ {
+ std::string cpu_part = cpu_part_match[1];
+ if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
+ {
+ cpu_part_occurrence_map[cpu_part]++;
+ }
+ else
+ {
+ cpu_part_occurrence_map[cpu_part] = 1;
+ }
+ }
+ }
+ }
+
+ // Get min number of threads
+ auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
+ [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
+ {
+ return p1.second < p2.second;
+ });
+
+ // Set thread hint
+ num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
+#endif /* BARE_METAL */
+
+ return num_threads_hint;
+}
+
+} // namespace arm_compute
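
Note: a worked decoding of the MIDR bit fields used by midr_to_model(), taken from the A55r1 whitelist value above (standalone sketch, not library code):

#include <cassert>

int main()
{
    // Cortex-A55 r1 reads MIDR 0x4110d050:
    //   implementer = (midr >> 24) & 0xff  = 0x41  ('A' for ARM)
    //   variant     = (midr >> 20) & 0xf   = 0x1   (r1)
    //   part        = (midr >> 4)  & 0xfff = 0xd05 (Cortex-A55)
    const unsigned int midr    = 0x4110d050;
    const int          variant = (midr >> 20) & 0xF;
    const int          part    = (midr >> 4) & 0xFFF;
    assert(variant == 1 && part == 0xd05); // midr_to_model() returns CPUModel::A55r1
    return 0;
}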
diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
new file mode 100644
index 0000000..cdd12c3
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+void *GCBufferAllocator::allocate(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(alignment);
+ auto *gl_buffer = new GLBufferWrapper();
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, gl_buffer->_ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+
+ return reinterpret_cast<void *>(gl_buffer);
+}
+
+void GCBufferAllocator::free(void *ptr)
+{
+ ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+ auto *gl_buffer = reinterpret_cast<GLBufferWrapper *>(ptr);
+ delete gl_buffer;
+}
+
+std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment)
+{
+ ARM_COMPUTE_UNUSED(size, alignment);
+ return nullptr;
+}
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index fcc8559..f781273 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
using namespace arm_compute;
@@ -31,7 +32,7 @@
std::once_flag GCScheduler::_initialize_symbols;
GCScheduler::GCScheduler()
- : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT)
+ : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _target(GPUTarget::MIDGARD)
{
}
@@ -48,11 +49,13 @@
{
setup_context();
- GCKernelLibrary::get().init("./cs_shaders/", _display, _context);
+ init(_display, _context);
}
void GCScheduler::init(EGLDisplay dpy, EGLContext ctx)
{
+ _target = get_target_from_device();
+
GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx);
}
diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp
index edbd16d..e193d26 100644
--- a/src/runtime/GLES_COMPUTE/GCTensor.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
using namespace arm_compute;
GCTensor::GCTensor()
- : _allocator()
+ : _allocator(this)
{
}
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
index 694b34f..abd2b48 100644
--- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,11 +31,16 @@
using namespace arm_compute;
-GCTensorAllocator::GCTensorAllocator()
- : _gl_buffer(), _mapping(nullptr)
+GCTensorAllocator::GCTensorAllocator(GCTensor *owner)
+ : _associated_memory_group(nullptr), _gl_buffer(), _mapping(nullptr), _owner(owner)
{
}
+GCTensorAllocator::~GCTensorAllocator()
+{
+}
+
uint8_t *GCTensorAllocator::data()
{
return _mapping;
@@ -43,17 +48,35 @@
void GCTensorAllocator::allocate()
{
- _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
- ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+ if(_associated_memory_group == nullptr)
+ {
+ _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
+ ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+ }
+ else
+ {
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_gl_buffer), info().total_size());
+ }
info().set_is_resizable(false);
}
void GCTensorAllocator::free()
{
- _gl_buffer.reset();
- info().set_is_resizable(true);
+ if(_associated_memory_group == nullptr)
+ {
+ _gl_buffer.reset();
+ info().set_is_resizable(true);
+ }
+}
+
+void GCTensorAllocator::set_associated_memory_group(GCMemoryGroup *associated_memory_group)
+{
+ ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+ ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+ ARM_COMPUTE_ERROR_ON(_gl_buffer.get() != nullptr);
+ _associated_memory_group = associated_memory_group;
}
uint8_t *GCTensorAllocator::lock()
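
The allocator now has two allocation paths: a standalone tensor creates and owns its SSBO, while a tensor registered with a GCMemoryGroup receives its backing blob from the group when the lifetimes are finalized. A rough sketch of the difference (names follow this patch; the memory-manager wiring that ultimately backs the group is omitted):

    #include "arm_compute/runtime/GLES_COMPUTE/GCMemoryGroup.h"
    #include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"

    using namespace arm_compute;

    void allocation_paths(GCMemoryGroup &group)
    {
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);

        GCTensor standalone;
        standalone.allocator()->init(info);
        standalone.allocator()->allocate(); // creates and owns its GLBufferWrapper

        GCTensor managed;
        managed.allocator()->init(info);
        group.manage(&managed);             // starts the tensor's lifetime in the group
        managed.allocator()->allocate();    // ends it; the blob arrives via finalize_memory()
    }
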
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index 1d2370e..2a710f7 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -37,14 +37,14 @@
using namespace arm_compute;
GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+ : _weights_reshape_kernel(), _weights_reshaped()
{
}
-void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose1xW)
+void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
@@ -56,73 +56,66 @@
}
const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- const unsigned bias_element = (append_biases) ? 1 : 0;
const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr;
- _transpose1xW = transpose1xW;
-
- if(transpose1xW)
- {
- // Create tensor to store the reshaped weights
- const unsigned int mat_weights_cols = weights->info()->dimension(3);
- const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- const DataType dt = weights->info()->data_type();
- const int fixed_point_position = weights->info()->fixed_point_position();
- TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
-
- _weights_reshaped.allocator()->init(info_wr);
- _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped);
- _weights_transposed_kernel.configure(&_weights_reshaped, output);
- _weights_reshaped.allocator()->allocate();
- }
- else
- {
- _weights_reshape_kernel.configure(weights, biases_to_use, output);
- }
+ _weights_reshape_kernel.configure(weights, biases_to_use, output);
}
void GCConvolutionLayerReshapeWeights::run()
{
GCScheduler::get().dispatch(_weights_reshape_kernel);
- if(_transpose1xW)
- {
- GCScheduler::get().dispatch(_weights_transposed_kernel);
- }
}
-GCConvolutionLayer::GCConvolutionLayer()
- : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _fill_border(), _input_im2col_reshaped(), _input_interleaved_reshaped(),
- _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
+ _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_first_run(true), _is_activationlayer_enabled(false)
{
}
-void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output, bool is_interleaved_transposed)
+void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
{
- _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
+
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
}
-void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+Status GCConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
{
+ // Perform validation step on Matrix multiply function
+    ARM_COMPUTE_RETURN_ON_ERROR(GCGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
+ return Status{};
+}
+
+void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+ _is_first_run = true;
+ _original_weights = weights;
+
if(biases != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
const DataType dt = input->info()->data_type();
- _append_bias = (biases != nullptr);
- _are_weights_reshaped = weights_info.are_reshaped();
+ // Set the GPU target for im2col and col2im
+ _input_im2col_kernel.set_target(GCScheduler::get().get_target());
+ _output_col2im_kernel.set_target(GCScheduler::get().get_target());
- const unsigned bias_element = (_append_bias) ? 1 : 0;
- const IGCTensor *biases_to_use = (_append_bias) ? biases : nullptr;
+ const bool append_bias = (biases != nullptr);
+ const unsigned bias_element = (append_bias) ? 1 : 0;
+ const IGCTensor *biases_to_use = (append_bias) ? biases : nullptr;
// Get parameters from conv_info
unsigned int stride_x = 0;
@@ -133,57 +126,19 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
- const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ const unsigned int kernel_width = weights->info()->dimension(0);
+ const unsigned int kernel_height = weights->info()->dimension(1);
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
- conv_info);
-
- // Check if its a "fully connected" convolution
- _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- const bool run_interleaved = (!_is_fully_connected_convolution);
+ conv_info, dilation);
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
- // Reshape weights if needed
- if(_are_weights_reshaped)
- {
- if(_is_fully_connected_convolution)
- {
- mat_weights_cols = weights->info()->dimension(0);
- mat_weights_rows = weights->info()->dimension(1);
- }
- else
- {
- mat_weights_cols = weights_info.num_kernels();
- const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
- mat_weights_rows = quarter_reshaped_cols + bias_element;
- }
- }
- else
- {
- if(_is_fully_connected_convolution)
- {
- // Create tensor to store the reshaped weights
- int num_elems_read_per_iteration_x = 1;
- if(dt == DataType::F16)
- {
- num_elems_read_per_iteration_x = 2;
- }
- TensorShape shape_wr((ceil_to_multiple(mat_weights_cols, num_elems_read_per_iteration_x)), mat_weights_rows);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wr));
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false /* 1xW transpose */);
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wt));
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, true /* 1xW transpose */);
- }
- weights = &_weights_reshaped;
- }
+ // _weights_reshaped will be auto configured in the kernel.
+ // Just append biases and do not transpose 1xW as it will be reshaped in GCGEMM
+ _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+
+ weights = &_weights_reshaped;
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
@@ -195,17 +150,7 @@
TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
_input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
-
- // Create tensor (interleave) to prepare input tensor for GEMM
- if(run_interleaved)
- {
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-
- TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position());
- _input_interleaved_reshaped.allocator()->init(interleaved_info);
- }
+ _memory_group.manage(&_input_im2col_reshaped);
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
@@ -215,27 +160,20 @@
TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
_gemm_output.allocator()->init(info_gemm);
+ _memory_group.manage(&_gemm_output);
- // Configure kernels
if(dt == DataType::F16)
{
BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
input->info()->extend_padding(border_size);
_fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border
}
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
+ // Configure im2col
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
- // Configure matrix multiply
- if(run_interleaved)
- {
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output);
- _input_interleaved_reshaped.allocator()->allocate();
- }
- else
- {
- configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, false);
- }
+ // Configure GEMM
+ configure_mm(&_input_im2col_reshaped, weights, &_gemm_output);
+
_input_im2col_reshaped.allocator()->allocate();
// Configure Col2Im
@@ -245,38 +183,53 @@
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
// Allocate intermediate tensor
- if(!_are_weights_reshaped)
+ _weights_reshaped.allocator()->allocate();
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
{
- _weights_reshaped.allocator()->allocate();
+ _activationlayer_function.configure(output, nullptr, act_info);
}
+
+ ARM_COMPUTE_UNUSED(weights_info);
}
void GCConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
- if(!_are_weights_reshaped)
+ if(_is_first_run)
{
- _are_weights_reshaped = true;
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_reshape_weights.run();
+ _is_first_run = false;
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
+ _memory_group.acquire();
+
// Run im2col
GCScheduler::get().dispatch(_fill_border);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_input_im2col_kernel);
- if(!_is_fully_connected_convolution)
- {
- GCScheduler::get().memory_barrier();
- // Run interleave4x4
- GCScheduler::get().dispatch(_input_interleave_kernel);
- }
-
- GCScheduler::get().memory_barrier();
- // Runs matrix multiply on reshaped matrices
- GCScheduler::get().dispatch(_mm_kernel);
+ // Run gemm on reshaped matrices
+ _mm_gemm.run();
GCScheduler::get().memory_barrier();
// Reshape output matrix
GCScheduler::get().dispatch(_output_col2im_kernel, false);
+
+ _memory_group.release();
+
+ GCScheduler::get().memory_barrier();
+ // Run Activation Layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
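
Taken together, the rework makes the GC convolution behave like its CL counterpart: intermediates live in a memory group, weights are reshaped once on the first run, and an optional fused activation runs last. A configuration sketch, assuming the v18.05 memory-manager wiring (set_allocator/set_num_pools/finalize) and illustrative parameters:

    #include "arm_compute/runtime/BlobLifetimeManager.h"
    #include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
    #include "arm_compute/runtime/MemoryManagerOnDemand.h"
    #include "arm_compute/runtime/PoolManager.h"

    #include <memory>

    using namespace arm_compute;

    void run_conv(GCTensor &src, GCTensor &weights, GCTensor &biases, GCTensor &dst)
    {
        auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
        auto pool_mgr     = std::make_shared<PoolManager>();
        auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

        GCConvolutionLayer conv(memory_mgr);
        conv.configure(&src, &weights, &biases, &dst,
                       PadStrideInfo(1, 1, 1, 1), WeightsInfo(), Size2D(1U, 1U),
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        // Finalize the pools once every managed function has been configured.
        GCBufferAllocator allocator{};
        memory_mgr->set_allocator(&allocator);
        memory_mgr->set_num_pools(1);
        memory_mgr->finalize();

        conv.run(); // first run reshapes the weights; later runs reuse them
    }
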
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index 9cba371..7121654 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -35,10 +35,10 @@
{
}
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, depth_multiplier);
_kernel = std::move(k);
// Configure border handler
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index a2607d4..c0cf098 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -39,26 +39,27 @@
{
}
-void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
int kernel_size = weights->info()->dimension(0);
if(kernel_size == 1)
{
auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer1x1Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, act_info);
_kernel = std::move(k);
}
else if(kernel_size == 3)
{
auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer3x3Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, act_info);
_kernel = std::move(k);
}
else if(kernel_size == 5)
{
auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer5x5Kernel>();
- k->configure(input, weights, biases, output, conv_info);
+ k->configure(input, weights, biases, output, conv_info, act_info);
_kernel = std::move(k);
}
else
@@ -79,4 +80,6 @@
GCScheduler::get().dispatch(_border_handler, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(*_kernel);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_shift_handler);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index 9e4f0f6..a300033 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -38,9 +38,9 @@
_kernel = std::move(k);
}
-GCFullyConnectedLayer::GCFullyConnectedLayer()
- : _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true),
- _accumulate_biases(false)
+GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
{
}
@@ -61,6 +61,7 @@
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
// Configure im2col kernel
+ _memory_group.manage(&_im2col_output);
_im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
@@ -78,7 +79,8 @@
_mm_kernel.configure(input, weights, output, 1.0f, false);
}
-void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose_weights, bool are_weights_reshaped)
+void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
+ bool transpose_weights, bool are_weights_reshaped, bool retain_internal_weights)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
@@ -140,11 +142,14 @@
}
// Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!_are_weights_reshaped)
+ if(!_are_weights_reshaped && !retain_internal_weights)
{
// Allocate the tensor for the weights reshaped
_reshape_weights_output.allocator()->allocate();
}
+
+ ARM_COMPUTE_ERROR_ON(retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
+ _are_weights_reshaped = _are_weights_reshaped || retain_internal_weights;
}
void GCFullyConnectedLayer::run()
@@ -156,6 +161,8 @@
_reshape_weights_kernel.run();
}
+ _memory_group.acquire();
+
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
@@ -177,4 +184,6 @@
GCScheduler::get().dispatch(_accumulate_biases_kernel);
}
+
+ _memory_group.release();
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 5122c20..79f8f71 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -38,59 +38,90 @@
#include "arm_compute/runtime/ITensorAllocator.h"
using namespace arm_compute;
-using namespace arm_compute::gles_compute;
-GCGEMM::GCGEMM()
- : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+namespace
+{
+Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    if(c != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ }
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_UNUSED(gemm_info);
+ return Status{};
+}
+} // namespace
+
+GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
+ _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.reshape_b_only_on_first_run(), "Reshape matrix B only on first run is not supported");
- ARM_COMPUTE_UNUSED(gemm_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- if(c != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
- ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
- ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
- }
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
- // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
- _is_interleaved_transposed = a->info()->dimension(1) > 16;
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
const IGCTensor *matrix_a = a;
const IGCTensor *matrix_b = b;
+ // Get the GPU target
+ const GPUTarget gpu_target = GCScheduler::get().get_target();
+
+ // Set the target for the kernels
+ _interleave_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to GCGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->info()->dimension(1);
+ const int n = b->info()->dimension(0);
+ const int k = a->info()->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
+ _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
if(_is_interleaved_transposed)
{
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
-
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
- const unsigned int transpose_w = max_gc_vector_width / data_size_from_type(b->info()->data_type());
- shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- _tmp_a.allocator()->init(info_a);
-
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
- _tmp_b.allocator()->init(info_b);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+ // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
_interleave_kernel.configure(a, &_tmp_a);
@@ -99,7 +130,7 @@
_transpose_kernel.configure(b, &_tmp_b);
}
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
if(_is_interleaved_transposed)
{
@@ -116,15 +147,31 @@
}
}
+Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+ return Status{};
+}
+
void GCGEMM::run()
{
+ _memory_group.acquire();
if(_is_interleaved_transposed)
{
// Run interleave kernel
GCScheduler::get().dispatch(_interleave_kernel, false);
- // Run transpose kernel
- GCScheduler::get().dispatch(_transpose_kernel, false);
+ if(_is_first_run)
+ {
+ // Run transpose kernel
+ GCScheduler::get().dispatch(_transpose_kernel, false);
+ _is_first_run = false;
+ }
+ else if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ GCScheduler::get().dispatch(_transpose_kernel, false);
+ }
GCScheduler::get().memory_barrier();
}
@@ -137,4 +184,5 @@
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_ma_kernel);
}
+ _memory_group.release();
}
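
The _reshape_b_only_on_first_run path mirrors the CL backend: when the GEMMInfo flag is set, matrix B is transposed on the first run() only and the cached reshape is reused afterwards, which pays off whenever B holds constant weights. A usage sketch:

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"

    using namespace arm_compute;

    void weight_stationary_gemm(GCTensor &a, GCTensor &b_weights, GCTensor &dst)
    {
        GCGEMM gemm;
        gemm.configure(&a, &b_weights, nullptr, &dst, 1.0f, 0.0f,
                       GEMMInfo(false /* is_a_reshaped */, false /* is_b_reshaped */, true /* reshape_b_only_on_first_run */));

        for(int i = 0; i < 10; ++i)
        {
            // ... refresh tensor 'a' with new input ...
            gemm.run(); // _transpose_kernel is dispatched on the first iteration only
        }
    }
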
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index fc3882d..b2e69ee 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,8 +33,8 @@
using namespace arm_compute;
-GCNormalizationLayer::GCNormalizationLayer()
- : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+GCNormalizationLayer::GCNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
{
}
@@ -43,6 +43,7 @@
ARM_COMPUTE_ERROR_ON(input == nullptr);
_squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+ _memory_group.manage(&_squared_input);
_norm_kernel.configure(input, &_squared_input, output, norm_info);
_multiply_kernel.configure(input, input, &_squared_input, 1.0f);
@@ -55,9 +56,13 @@
void GCNormalizationLayer::run()
{
+ _memory_group.acquire();
+
GCScheduler::get().dispatch(_multiply_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_border_handler, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_norm_kernel, true);
+
+ _memory_group.release();
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 5221c5c..1748a59 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,8 @@
using namespace arm_compute;
-GCSoftmaxLayer::GCSoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
{
}
@@ -50,6 +50,11 @@
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp);
+ _memory_group.manage(&_max);
+ _memory_group.manage(&_sum);
+
// Configure Kernels
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
@@ -63,9 +68,13 @@
void GCSoftmaxLayer::run()
{
+ _memory_group.acquire();
+
GCScheduler::get().dispatch(_max_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_norm_kernel);
+
+ _memory_group.release();
}
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 583cb40..54a2bd2 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -23,202 +23,20 @@
*/
#include "arm_compute/runtime/IScheduler.h"
-#include <array>
-#include <cstdlib>
-#include <cstring>
-#include <fcntl.h>
-#include <fstream>
-#include <map>
-#include <sched.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#ifndef BARE_METAL
-#include <regex>
-#include <thread>
-#endif /* BARE_METAL */
-
-namespace
-{
-unsigned int get_threads_hint()
-{
- unsigned int num_threads_hint = 1;
-
-#ifndef BARE_METAL
- std::map<std::string, unsigned int> cpu_part_occurrence_map;
-
- // CPU part regex
- std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
- std::smatch cpu_part_match;
-
- // Read cpuinfo and get occurrence of each core
- std::ifstream cpuinfo;
- cpuinfo.open("/proc/cpuinfo", std::ios::in);
- if(cpuinfo.is_open())
- {
- std::string line;
- while(bool(getline(cpuinfo, line)))
- {
- if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
- {
- std::string cpu_part = cpu_part_match[1];
- if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
- {
- cpu_part_occurrence_map[cpu_part]++;
- }
- else
- {
- cpu_part_occurrence_map[cpu_part] = 1;
- }
- }
- }
- }
-
- // Get min number of threads
- auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
- [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
- {
- return p1.second < p2.second;
- });
-
- // Set thread hint
- num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
-#endif /* BARE_METAL */
-
- return num_threads_hint;
-}
-
-unsigned int get_cpu_impl()
-{
-#ifndef BARE_METAL
- int fd = open("/proc/cpuinfo", 0); // NOLINT
- std::array<char, 3000> buff{ {} };
- char *pos = nullptr;
- char *end = nullptr;
- bool foundid = false;
-
- int cpu = sched_getcpu();
-
- if(fd == -1)
- {
- return 0;
- }
-
- int charsread = read(fd, buff.data(), 3000);
- pos = buff.data();
- end = buff.data() + charsread;
-
- close(fd);
-
- /* So, to date I've encountered two formats for /proc/cpuinfo.
- *
- * One of them just lists processor : n for each processor (with no
- * other info), then at the end lists part information for the current
- * CPU.
- *
- * The other has an entire clause (including part number info) for each
- * CPU in the system, with "processor : n" headers.
- *
- * We can cope with either of these formats by waiting to see
- * "processor: n" (where n = our CPU ID), and then looking for the next
- * "CPU part" field.
- */
- while(pos < end)
- {
- if(foundid && strncmp(pos, "CPU part", 8) == 0)
- {
- /* Found part number */
- pos += 11;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- return strtoul(pos, nullptr, 0);
- }
-
- if(strncmp(pos, "processor", 9) == 0)
- {
- /* Found processor ID, see if it's ours. */
- pos += 11;
-
- for(char *ch = pos; ch < end; ch++)
- {
- if(*ch == '\n')
- {
- *ch = '\0';
- break;
- }
- }
-
- int num = strtol(pos, nullptr, 0);
-
- if(num == cpu)
- {
- foundid = true;
- }
- }
-
- while(pos < end)
- {
- char ch = *pos++;
- if(ch == '\n' || ch == '\0')
- {
- break;
- }
- }
- }
-#endif /* BARE_METAL */
-
- return 0;
-}
-} // namespace
+#include "arm_compute/runtime/CPUUtils.h"
namespace arm_compute
{
IScheduler::IScheduler()
+ : _cpu_info()
{
// Work out the best possible number of execution threads
_num_threads_hint = get_threads_hint();
-
- // Work out the CPU implementation
- switch(get_cpu_impl())
- {
- case 0xd0f:
- _info.CPU = CPUTarget::A55_DOT;
- break;
- case 0xd03:
- _info.CPU = CPUTarget::A53;
- break;
- default:
-#ifdef __arm__
- _info.CPU = CPUTarget::ARMV7;
-#elif __aarch64__
- _info.CPU = CPUTarget::ARMV8;
-#else /* __arm__ || __aarch64__ */
- _info.CPU = CPUTarget::INTRINSICS;
-#endif /* __arm__ || __aarch64__ */
- break;
- }
-
- _info.L1_size = 31000;
- _info.L2_size = 500000;
}
-void IScheduler::set_target(CPUTarget target)
+CPUInfo &IScheduler::cpu_info()
{
- _info.CPU = target;
-}
-
-CPUInfo IScheduler::cpu_info() const
-{
- return _info;
+ return _cpu_info;
}
unsigned int IScheduler::num_threads_hint() const
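
With the /proc/cpuinfo parsing moved into the shared CPUUtils helpers, schedulers populate a mutable CPUInfo instead of the old hard-coded CPUTarget and cache-size guesses, and callers ask for a CPU model rather than a raw part number. The Winograd heuristic later in this patch does exactly that; as a standalone sketch:

    #include "arm_compute/runtime/Scheduler.h"

    using namespace arm_compute;

    bool prefer_gemm_over_winograd()
    {
        // CPUInfo is filled in at scheduler construction by the CPUUtils helpers;
        // A53 cores are steered to the GEMM path (see NEConvolutionLayer below).
        return Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53;
    }
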
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index 2c64475..faaff8a 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@
using namespace arm_compute;
ISimpleLifetimeManager::ISimpleLifetimeManager()
- : _active_group(nullptr), _active_elements(), _finalized_groups()
+ : _active_group(nullptr), _active_elements(), _free_blobs(), _occupied_blobs(), _finalized_groups()
{
}
@@ -53,14 +53,21 @@
void ISimpleLifetimeManager::start_lifetime(void *obj)
{
ARM_COMPUTE_ERROR_ON(obj == nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+ ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!");
+
+ // Check if there is a free blob
+ if(_free_blobs.empty())
{
- return obj == e.id;
- }) != std::end(_active_elements),
- "Memory object is already registered!");
+ _occupied_blobs.emplace_front(Blob{ obj, 0, { obj } });
+ }
+ else
+ {
+ _occupied_blobs.splice(std::begin(_occupied_blobs), _free_blobs, std::begin(_free_blobs));
+ _occupied_blobs.front().id = obj;
+ }
// Insert object in groups and mark its finalized state to false
- _active_elements.emplace_back(obj);
+ _active_elements.insert(std::make_pair(obj, obj));
}
void ISimpleLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
@@ -68,36 +75,50 @@
ARM_COMPUTE_ERROR_ON(obj == nullptr);
// Find object
- auto it = std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
- {
- return obj == e.id;
- });
- ARM_COMPUTE_ERROR_ON(it == std::end(_active_elements));
+ auto active_object_it = _active_elements.find(obj);
+ ARM_COMPUTE_ERROR_ON(active_object_it == std::end(_active_elements));
// Update object fields and mark object as complete
- it->handle = handle;
- it->size = size;
- it->status = true;
+ Element &el = active_object_it->second;
+ el.handle = handle;
+ el.size = size;
+ el.status = true;
+
+ // Find object in the occupied lists
+ auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
+ {
+ return obj == b.id;
+ });
+ ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs));
+
+ // Update occupied blob and return as free
+ occupied_blob_it->bound_elements.insert(obj);
+ occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
+ occupied_blob_it->id = nullptr;
+ _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
     // Check if all objects are finalized and reset the active group
if(are_all_finalized())
{
- // Update finalized groups
- _finalized_groups[_active_group].insert(std::end(_finalized_groups[_active_group]), std::begin(_active_elements), std::end(_active_elements));
+ ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty());
// Update blobs and group mappings
update_blobs_and_mappings();
+ // Update finalized groups
+ _finalized_groups[_active_group] = std::move(_active_elements);
+
// Reset state
_active_elements.clear();
_active_group = nullptr;
+ _free_blobs.clear();
}
}
bool ISimpleLifetimeManager::are_all_finalized() const
{
- return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const Element e)
+ return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e)
{
- return !e.status;
+ return !e.second.status;
});
}
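
The list splicing above is what lets tensors with disjoint lifetimes share one blob: a blob sits in _occupied_blobs while its current object is alive, and moves back to _free_blobs, carrying its bound elements and the running max_size, when the lifetime ends. A minimal standalone model of that bookkeeping (not ACL code), with two objects whose lifetimes do not overlap:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <list>
    #include <set>

    // Standalone model of the free/occupied blob lists; simplified to one live object at a time.
    struct Blob
    {
        void            *id;
        std::size_t      max_size;
        std::set<void *> bound_elements;
    };

    int main()
    {
        std::list<Blob> free_blobs;
        std::list<Blob> occupied_blobs;

        auto start_lifetime = [&](void *obj)
        {
            if(free_blobs.empty())
            {
                occupied_blobs.push_front(Blob{ obj, 0, { obj } });
            }
            else
            {
                occupied_blobs.splice(occupied_blobs.begin(), free_blobs, free_blobs.begin());
                occupied_blobs.front().id = obj;
            }
        };
        auto end_lifetime = [&](void *obj, std::size_t size)
        {
            auto it = occupied_blobs.begin();
            it->bound_elements.insert(obj);
            it->max_size = std::max(it->max_size, size);
            it->id       = nullptr;
            free_blobs.splice(free_blobs.begin(), occupied_blobs, it);
        };

        int t0 = 0, t1 = 0;
        start_lifetime(&t0); end_lifetime(&t0, 1024); // t0 occupies a fresh blob, then frees it
        start_lifetime(&t1); end_lifetime(&t1, 4096); // t1 reuses the same blob

        // Prints "1 blob(s), max_size=4096": one backing buffer serves both tensors.
        std::cout << free_blobs.size() << " blob(s), max_size=" << free_blobs.front().max_size << "\n";
        return 0;
    }
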
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index 35d0c82..15bbb17 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,40 +23,45 @@
*/
#include "arm_compute/runtime/Memory.h"
-#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/MemoryRegion.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
Memory::Memory()
- : _memory(nullptr), _memory_owned(nullptr)
+ : _region(nullptr), _region_owned(nullptr)
{
+ create_empty_region();
}
-Memory::Memory(std::shared_ptr<uint8_t> memory)
- : _memory(nullptr), _memory_owned(std::move(memory))
+Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
+ : _region(nullptr), _region_owned(std::move(memory))
{
- ARM_COMPUTE_ERROR_ON(_memory_owned.get() == nullptr);
- _memory = _memory_owned.get();
+ if(_region_owned == nullptr)
+ {
+ create_empty_region();
+ }
+ _region = _region_owned.get();
}
-Memory::Memory(uint8_t *memory)
- : _memory(memory), _memory_owned(nullptr)
+Memory::Memory(IMemoryRegion *memory)
+ : _region(memory), _region_owned(nullptr)
{
- ARM_COMPUTE_ERROR_ON(memory == nullptr);
+ _region = memory;
}
-uint8_t *Memory::buffer()
+IMemoryRegion *Memory::region()
{
- return _memory;
+ return _region;
}
-uint8_t *Memory::buffer() const
+IMemoryRegion *Memory::region() const
{
- return _memory;
+ return _region;
}
-uint8_t **Memory::handle()
+void Memory::create_empty_region()
{
- ARM_COMPUTE_ERROR_ON(_memory_owned.get() != nullptr);
- return &_memory;
-}
\ No newline at end of file
+ _region_owned = std::make_shared<MemoryRegion>(0);
+ _region = _region_owned.get();
+}
+} // namespace arm_compute
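
Memory is now backed by an IMemoryRegion and always holds a valid region, possibly of size zero, instead of a raw uint8_t pointer that could be null. A sketch of the three construction modes:

    #include "arm_compute/runtime/Memory.h"
    #include "arm_compute/runtime/MemoryRegion.h"

    #include <memory>

    using namespace arm_compute;

    void memory_modes()
    {
        Memory empty;                                      // owns an empty (size 0) MemoryRegion
        Memory owned(std::make_shared<MemoryRegion>(256)); // shares ownership of the region

        MemoryRegion external(128);
        Memory borrowed(&external);                        // non-owning; 'external' must outlive it

        IMemoryRegion *region = owned.region();            // uniform accessor for all three
        (void)region;
    }
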
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..b5b159a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
+ : _kernel()
+{
+}
+
+void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ _kernel.configure(input, output, original_input_shape, data_layout);
+}
+
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+ DataLayout data_layout)
+{
+ return NEConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
+
+void NEConvertFullyConnectedWeights::run()
+{
+ NEScheduler::get().schedule(&_kernel, Window::DimZ);
+}
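
The new function is a thin scheduler wrapper around NEConvertFullyConnectedWeightsKernel. It reorders fully-connected weights when the data layout feeding the layer changes, e.g. weights trained against an NCHW feature map used in an NHWC graph. A hedged usage sketch with illustrative shapes:

    #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void convert_fc_weights(Tensor &weights_in, Tensor &weights_out)
    {
        // original_input_shape is the pre-flatten shape of the tensor that fed the
        // fully-connected layer at training time; 7x7x64 is illustrative only.
        const TensorShape original_input_shape(7U, 7U, 64U);

        NEConvertFullyConnectedWeights convert;
        ARM_COMPUTE_ERROR_THROW_ON(NEConvertFullyConnectedWeights::validate(
            weights_in.info(), weights_out.info(), original_input_shape, DataLayout::NCHW));
        convert.configure(&weights_in, &weights_out, original_input_shape, DataLayout::NCHW);
        convert.run();
    }
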
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 0a49158..7053c7e 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -30,41 +30,44 @@
#include <cmath>
#include <tuple>
+#include <utility>
namespace arm_compute
{
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_manager(std::move(memory_manager)), _function()
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT
+ : _memory_manager(std::move(memory_manager)),
+ _function()
{
}
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+ enable_fast_math));
- switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
- weights_info))
+ switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
{
- auto f = arm_compute::support::cpp14::make_unique<NEWinogradLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info);
+ auto f = arm_compute::support::cpp14::make_unique<NEWinogradConvolutionLayer>(_memory_manager);
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_function = std::move(f);
break;
}
case ConvolutionMethod::GEMM:
{
auto f = arm_compute::support::cpp14::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info);
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
_function = std::move(f);
break;
}
case ConvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info);
+ f->configure(input, weights, biases, output, conv_info, act_info);
_function = std::move(f);
break;
}
@@ -75,21 +78,21 @@
}
Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
- switch(NEConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info))
+ switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
//Validate Winograd
- NEWinogradLayer::validate(input, weights, biases, output, conv_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM:
//Validate Gemm-based Convolution
- NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
break;
case ConvolutionMethod::DIRECT:
-            //Validate Gemm-based Convolution
-            NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+            //Validate Direct Convolution
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+            break;
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
@@ -98,17 +101,20 @@
return Status{};
}
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
- ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
ARM_COMPUTE_UNUSED(weights_info);
- if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1)
- && (conv_info.stride().second == 1) && (biases != nullptr))
+
+ if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
+ || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
{
- return ConvolutionMethod::WINOGRAD;
+ return ConvolutionMethod::GEMM;
}
- return ConvolutionMethod::GEMM;
+
+ return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
}
void NEConvolutionLayer::run()
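
The selection heuristic is now observable from outside the function, which helps when debugging why a graph picked GEMM over Winograd: dilation other than 1x1, an A53 CPU, or sixteen or fewer input channels force GEMM, and otherwise Winograd wins whenever its validate() passes. A sketch (enable_fast_math is presumably defaulted in the header, since the callers above omit it):

    #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

    using namespace arm_compute;

    ConvolutionMethod which_method(const ITensorInfo *src, const ITensorInfo *weights,
                                   const ITensorInfo *dst, const PadStrideInfo &conv_info)
    {
        return NEConvolutionLayer::get_convolution_method(src, weights, dst, conv_info,
                                                          WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo());
    }
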
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index c1ba5dd..40ada8f 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -34,6 +34,7 @@
NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
: _memory_group(std::move(memory_manager)),
_conv_f(),
+ _upsample_f(),
_scaled_output(),
_input(nullptr),
_info(),
@@ -41,13 +42,64 @@
{
}
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
+ unsigned int inner_border_right, unsigned int inner_border_top)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
+
+ auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
+ info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, bias);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ }
+
+ if(output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ }
+
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
+ info)));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != scale_out_info.dimension(i));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
+
+ return Status{};
+}
+
void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
unsigned int inner_border_right, unsigned int inner_border_top)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(!info.padding_is_symmetric());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
_input = input;
_info = info;
@@ -55,15 +107,9 @@
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
- info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
- const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
-
- ARM_COMPUTE_UNUSED(output_shape);
- ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
- ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
- ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
_memory_group.manage(&_scaled_output);
@@ -79,44 +125,20 @@
// Allocate auxiliary tensors
_scaled_output.allocator()->allocate();
+
+ // configure upsample function
+ _upsample_f.configure(input, &_scaled_output, info, inner_border_right, inner_border_top);
}
void NEDeconvolutionLayer::run()
{
_memory_group.acquire();
- // Initialize _scaled_output buffer
- const int width_in = _input->info()->dimension(0);
- const int height_in = _input->info()->dimension(1);
- const int width_scaled = _scaled_output.info()->dimension(0);
- const int height_scaled = _scaled_output.info()->dimension(1);
- const int num_2d_slices = _input->info()->tensor_shape().total_size() / (width_in * height_in);
- const int stride_x = _info.stride().first;
- const int stride_y = _info.stride().second;
-
- std::fill_n(_scaled_output.buffer(), _scaled_output.info()->total_size(), 0);
-
- // scaled_output is the input for the forward convolution. We copy the input elements to scaled_output
- // and insert rows and columns with zeroes depending on the stride values.
- for(int slice = 0; slice < num_2d_slices; ++slice)
- {
- const int start_x = _info.pad().first;
- const int start_y = _inner_border.second + _info.pad().second;
- const int end_y = height_scaled - _info.pad().second;
- const int end_x = width_scaled - _inner_border.first - _info.pad().first;
-
- for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
- {
- for(int xi = start_x, in_x = 0; xi < end_x; xi += stride_x, in_x++)
- {
- const auto in = *(reinterpret_cast<float *>(_input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(in_x, in_y, slice))));
- *(reinterpret_cast<float *>(_scaled_output.buffer() + _scaled_output.info()->offset_element_in_bytes(Coordinates(xi, yi, slice)))) = in;
- }
- }
- }
+ // Run upsample kernel
+ _upsample_f.run();
// Run convolution layer
_conv_f.run();
_memory_group.release();
-}
+}
\ No newline at end of file
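
With the hand-rolled zero-insertion loop replaced by an upsample function, run() reduces to upsample-then-convolve, and validate() now front-loads every shape and type check that configure() used to assert inline. A usage sketch; the stride-2 parameters are illustrative:

    #include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void deconv(Tensor &src, Tensor &weights, Tensor &bias, Tensor &dst)
    {
        const PadStrideInfo info(2, 2, 1, 1); // symmetric padding, as validate() requires

        NEDeconvolutionLayer layer;
        ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(
            src.info(), weights.info(), bias.info(), dst.info(), info,
            0 /* inner_border_right */, 0 /* inner_border_top */));

        layer.configure(&src, &weights, &bias, &dst, info, 0, 0);
        layer.run(); // upsample (zero insertion) followed by the forward convolution
    }
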
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 95fcf88..0a977ad 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -37,11 +37,11 @@
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
: _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
- _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
+ _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
{
}
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -52,30 +52,39 @@
_has_bias = biases != nullptr;
_is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
conv_info,
- input->info()->data_type());
+ input->info()->data_type(),
+ depth_multiplier,
+ input->info()->data_layout());
_are_weights_reshaped = false;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+
+ ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
if(_is_optimized)
{
- // Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ if(_is_nchw)
+ {
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- // Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
- // Configure optimized depthwise
- _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+ // Configure optimized depthwise
+ _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
- // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+ // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+ _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
- // Allocate tensors
- _input_nhwc.allocator()->allocate();
- _weights_hwio.allocator()->allocate();
- _output_nhwc.allocator()->allocate();
-
- // Create convolver (deferred)
- _dwc_kernel.generate_convolver();
+ // Allocate tensors
+ _input_nhwc.allocator()->allocate();
+ _weights_hwio.allocator()->allocate();
+ _output_nhwc.allocator()->allocate();
+ }
+ else
+ {
+ _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+ }
}
else
{
@@ -88,7 +97,7 @@
}
// Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
// Configure border handler
_border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
@@ -116,8 +125,15 @@
void NEDepthwiseConvolutionLayer3x3::run()
{
+ if(_is_first_run && _is_optimized)
+ {
+ _is_first_run = false;
+ // Create convolver (deferred)
+ _dwc_kernel.generate_convolver();
+ }
+
// Permute weights in HWIO format if the optimized kernel will be executed
- if(!_are_weights_reshaped && _is_optimized)
+ if(!_are_weights_reshaped && _is_optimized && _is_nchw)
{
_are_weights_reshaped = true;
_permute_weights.run();
@@ -126,8 +142,11 @@
// Handle input
if(_is_optimized)
{
- // Permute input to NHWC format execution
- _permute_input.run();
+ if(_is_nchw)
+ {
+ // Permute input to NHWC format execution
+ _permute_input.run();
+ }
}
else
{
@@ -139,7 +158,7 @@
NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
// Permute output to ACL's native NCHW format in case of NHWC execution
- if(_is_optimized)
+ if(_is_optimized && _is_nchw)
{
_permute_output.run();
}
@@ -153,31 +172,37 @@
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
{
}
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
const size_t weights_w = weights->info()->dimension(0);
const size_t weights_h = weights->info()->dimension(1);
const size_t weights_z = weights->info()->dimension(2);
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_first_run = true;
+ _original_weights = weights;
// Should bias be appended?
bool append_bias = (biases != nullptr) && !_is_quantized;
// Calculate output shape
- TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
// Output width and height
- const unsigned int conv_w = dwc_output_shape.x();
- const unsigned int conv_h = dwc_output_shape.y();
+ const unsigned int conv_w = output_shape.x();
+ const unsigned int conv_h = output_shape.y();
// Set up intermediate tensors
const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
@@ -189,7 +214,7 @@
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
_input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -204,7 +229,7 @@
shape_v2mm_out.set(2, 1);
_v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
_v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
_vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
// Output staged configuration
@@ -241,10 +266,21 @@
void NEDepthwiseConvolutionLayer::run()
{
+ // Run weights reshaping (runs once for every configure)
+ if(_is_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+ _is_first_run = false;
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
+ }
+
NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
- NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
- NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
if(_is_quantized)
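Editor's note: the depth_multiplier parameter threaded through both depthwise variants above expands the channel dimension. Hypothetical helpers (not the library's shape calculator) showing the arithmetic behind the new shape checks, assuming symmetric padding and equal strides for brevity:

    #include <utility>

    // Spatial output size of a depthwise convolution (sketch).
    std::pair<unsigned int, unsigned int> dwc_output_wh(unsigned int in_w, unsigned int in_h,
                                                        unsigned int kernel_w, unsigned int kernel_h,
                                                        unsigned int stride, unsigned int pad)
    {
        const unsigned int out_w = (in_w + 2 * pad - kernel_w) / stride + 1;
        const unsigned int out_h = (in_h + 2 * pad - kernel_h) / stride + 1;
        return { out_w, out_h };
    }

    // Each input channel produces depth_multiplier output channels, which is the
    // invariant behind the new check:
    //   input->dimension(2) * depth_multiplier == weights->dimension(2)
    unsigned int dwc_output_channels(unsigned int in_channels, unsigned int depth_multiplier)
    {
        return in_channels * depth_multiplier;
    }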
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index a58b6e4..0627977 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
@@ -34,8 +35,18 @@
{
}
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
+
+ return Status{};
+}
+
void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
// Configure kernel
_dequantize_kernel.configure(input, output, min_max);
}
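Editor's note: the new static validate() mirrors configure()'s checks so a graph can be verified without allocating or configuring anything. A sketch of the typical call sequence under the new API; tensor info initialisation and allocation are omitted, and the ErrorCode comparison is an assumption about how callers inspect the returned Status:

    Tensor input, output, min_max;
    // ... allocator()->init(...) on all three tensors ...

    const Status status = NEDequantizationLayer::validate(input.info(), output.info(), min_max.info());
    if(status.error_code() == ErrorCode::OK)
    {
        NEDequantizationLayer dequant;
        dequant.configure(&input, &output, &min_max); // would assert on the same conditions
        dequant.run();
    }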
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index c26c99a..445864c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,18 +34,23 @@
using namespace arm_compute;
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false), _is_fixed_point(false)
+ : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), _is_fixed_point(false),
+ _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
{
}
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
+ ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
+
// Free accumulator
if(_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
+ _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
// Check if bias should be added in the convolution result
_has_bias = (bias != nullptr);
@@ -73,9 +78,17 @@
// Add zero padding XY
_input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+
+ // Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
@@ -101,6 +114,11 @@
// Validate bias kernel
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+
return Status{};
}
@@ -110,10 +128,15 @@
_memory_group.acquire();
- NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+ NEScheduler::get().schedule(&_conv_kernel, _dim_split);
if(_has_bias || _is_fixed_point)
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
}
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
_memory_group.release();
}
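Editor's note: the act_info parameter added above fuses an activation into the convolution function: the activation layer is configured in place on the output tensor and run after the convolution (and optional bias stage). A usage sketch; tensor initialisation and allocation are omitted:

    Tensor src, weights, bias, dst;

    NEDirectConvolutionLayer conv;
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    // run() now schedules the convolution kernel along DimZ (NCHW) or DimY (NHWC),
    // the optional output stage, then applies ReLU to dst in place.
    conv.run();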
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 26b7271..958d081 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -132,7 +132,7 @@
NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
- _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
+ _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
{
}
@@ -163,6 +163,7 @@
const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+ _original_weights = weights;
_linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
_are_weights_reshaped = are_weights_reshaped;
_accumulate_biases = biases != nullptr;
@@ -187,7 +188,7 @@
if(_linearize_input)
{
- _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input->info(), num_input_dimensions)));
+ _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input->info(), num_input_dimensions)));
// Configure im2col kernel
_memory_group.manage(&_im2col_output);
@@ -287,7 +288,7 @@
if(linearize_input)
{
- im2col_output->set_tensor_shape(compute_im2col_shape(input, num_input_dimensions));
+ im2col_output->set_tensor_shape(compute_im2col_fc_shape(input, num_input_dimensions));
ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
@@ -324,8 +325,13 @@
// Reshape of the weights (happens only once)
if(!_are_weights_reshaped)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_are_weights_reshaped = true;
_reshape_weights_kernel.run();
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
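Editor's note: the is_used()/mark_as_unused() pair above implements a run-once idiom that recurs in this patch: reshape the weights on the first run(), assert the caller has not already released them, then flag the original tensor so the memory manager may reclaim its buffer. A generic sketch of the idiom, not ACL code; the template parameter stands in for any tensor type exposing mark_as_unused():

    #include <functional>

    template <typename TensorT>
    class PrepareOnce
    {
    public:
        void operator()(const std::function<void()> &prepare, TensorT *original)
        {
            if(!_done)
            {
                _done = true;
                prepare();                  // e.g. run the weights-reshape kernel
                original->mark_as_unused(); // the reshaped copy is now the only user
            }
        }

    private:
        bool _done{ false };
    };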
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 05907ba..9168ed4 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,37 +26,20 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
#include <cmath>
namespace arm_compute
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
_run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
@@ -83,41 +66,14 @@
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
+ && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
+
// Check if the first input tensor is a vector.
// If so, all the kernels for reshaping the tensors can be skipped
if(_run_vector_matrix_multiplication)
{
-#if defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
- }
-
- if(_mm_optimised_kernel != nullptr)
- {
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- size_t workbench_size = 0;
-
- if(a->info()->data_type() == DataType::F32)
- {
- workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
- }
-
- constexpr size_t alignment = 4096;
- ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* defined(__aarch64__) */
+ if(!run_optimised)
{
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha, false);
@@ -132,65 +88,7 @@
}
else
{
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
- else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-#if defined(__arm__) || defined(__aarch64__)
- if(_mm_optimised_kernel != nullptr)
- {
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int M = d->info()->tensor_shape().y();
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- size_t workbench_size = 0;
-
-#if defined(__arm__)
- workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
-#elif defined(__aarch64__)
- if(a->info()->data_type() == DataType::F32)
- {
- workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
- }
- else if(a->info()->data_type() == DataType::F16)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
- constexpr size_t alignment = 4096;
- ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* defined(__arm__) || defined(__aarch64__) */
+ if(!run_optimised)
{
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -210,7 +108,10 @@
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
int m = a->info()->dimension(1);
int n = b->info()->dimension(0);
@@ -243,9 +144,9 @@
{
_memory_group.acquire();
- if(_mm_optimised_kernel != nullptr)
+ if(_asm_glue._optimised_kernel != nullptr)
{
- NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _asm_glue.run();
_memory_group.release();
}
else
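Editor's note: the per-architecture kernel selection removed above is now hidden behind setup_assembly_kernel() from AssemblyHelper.h, which reports whether an optimised assembly path could be configured; the generic interleave/transpose/multiply path remains the fallback. Note also that _tmp_b is only handed to the memory group when reshape_b_only_on_first_run is false, since a buffer that must survive across run() calls cannot be recycled by the group. The dispatch, reduced to its shape as an illustrative sketch (not the AssemblyHelper API):

    #include <functional>

    struct GemmDispatch
    {
        std::function<bool()> try_configure_asm; // returns false when unsupported
        std::function<void()> run_asm;           // e.g. _asm_glue.run()
        std::function<void()> run_generic;       // interleave/transpose + NEON kernel
        bool                  use_asm{ false };

        void configure() { use_asm = try_configure_asm(); }
        void run() { use_asm ? run_asm() : run_generic(); }
    };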
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index a85078c..2888b43 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -23,9 +23,6 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
@@ -34,13 +31,6 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
#include <cmath>
#include <tuple>
@@ -175,19 +165,28 @@
}
}
-Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
- bool &append_bias,
+Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info, DataType &dt,
+ bool &append_bias, bool &skip_im2col,
bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height,
- bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized,
+ bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized, bool &is_activationlayer_enabled,
unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
- unsigned int &conv_w, unsigned int &conv_h)
+ unsigned int &conv_w, unsigned int &conv_h, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(idx_channel) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32, "NHWC is only supported for FP32 data type.");
dt = input->data_type();
is_quantized = is_data_type_quantized_asymmetric(dt);
@@ -207,28 +206,32 @@
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
+ // If we have a 1x1 convolution and the data layout is NHWC we can skip im2col
append_bias = (biases != nullptr) && (!is_quantized);
are_weights_reshaped = weights_info.are_reshaped();
- kernel_width = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
- kernel_height = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
+ kernel_width = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(idx_width);
+ kernel_height = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(idx_height);
mat_weights_cols = weights->dimension(3);
- mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
+ mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + ((append_bias && !skip_im2col) ? 1 : 0);
+ skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
- conv_info);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), input->dimension(idx_height), kernel_width, kernel_height,
+ conv_info, dilation);
// Check if its a "fully connected" convolution
is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
is_interleaved = (!is_fully_connected_convolution && !is_quantized);
+ is_activationlayer_enabled = act_info.enabled();
return Status{};
}
} // namespace
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _mm_gemmlowp(memory_manager),
- _gemmlowp_output_stage(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
- _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
+ : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+ _output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
+ _tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
+ _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
{
}
@@ -256,26 +259,8 @@
}
}
-void NEGEMMConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K)
-{
- ARM_COMPUTE_UNUSED(ci);
- ARM_COMPUTE_UNUSED(M);
- ARM_COMPUTE_UNUSED(N);
- ARM_COMPUTE_UNUSED(K);
-#if defined(__arm__) || defined(__aarch64__)
-#if defined(__arm__)
- GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
-#elif defined(__aarch64__)
- GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
- constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-}
-
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -288,45 +273,35 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _append_bias, _are_weights_reshaped,
+ _data_layout = input->info()->data_layout();
+ const bool is_nhwc = _data_layout == DataLayout::NHWC;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, act_info, dt, _append_bias, _skip_im2col,
+ _are_weights_reshaped,
kernel_width, kernel_height,
- _is_fully_connected_convolution, _is_interleaved, _is_quantized,
- mat_weights_cols, mat_weights_rows, conv_w, conv_h);
+ _is_fully_connected_convolution, _is_interleaved, _is_quantized, _is_activationlayer_enabled,
+ mat_weights_cols, mat_weights_rows, conv_w, conv_h, dilation);
ARM_COMPUTE_ERROR_THROW_ON(status);
+ _original_weights = weights;
const unsigned int fixed_point_position = input->info()->fixed_point_position();
const ITensor *biases_to_use = (_append_bias) ? biases : nullptr;
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
+ bool run_optimised = dt == DataType::F32;
// Reshape weights if needed
- if(_mm_optimised_kernel != nullptr)
+ if(run_optimised)
{
- if(_are_weights_reshaped)
- {
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->info()->dimension(1);
- }
- else
- {
- TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+ TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
- // Create tensor to store the reshaped weights
- _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
- _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
- weights = &_weights_reshaped;
- }
+ // Create tensor to store the reshaped weights
+ _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ weights = &_weights_reshaped;
}
else
{
@@ -335,12 +310,12 @@
if(_is_fully_connected_convolution || _is_quantized)
{
mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->info()->dimension(1);
+ mat_weights_rows = weights->info()->dimension(idx_height);
}
else
{
mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(2) + (_append_bias ? 1 : 0);
+ mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(idx_channel) + (_append_bias ? 1 : 0);
}
}
else
@@ -366,66 +341,56 @@
}
}
- // Create tensor to store im2col reshaped inputs
- const unsigned int mat_input_cols = mat_weights_rows;
- const unsigned int mat_input_rows = conv_w * conv_h;
-
- TensorShape shape_im2col(input->info()->tensor_shape());
- shape_im2col.set(0, mat_input_cols);
- shape_im2col.set(1, mat_input_rows);
- shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _memory_group.manage(&_input_im2col_reshaped);
-
- // Create tensor (interleave) to prepare input tensor for GEMM
- if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
+ // In case we skip im2col, the bias has to be added directly on the output
+ if(!_skip_im2col)
{
- TensorShape shape_interleaved(shape_im2col);
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
- _memory_group.manage(&_input_interleaved_reshaped);
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+
+ // Create tensor to store im2col reshaped inputs
+ TensorShape shape_im2col(input->info()->tensor_shape());
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ _memory_group.manage(&_input_im2col_reshaped);
+
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(!_is_fully_connected_convolution && !run_optimised && _is_interleaved)
+ {
+ TensorShape shape_interleaved(shape_im2col);
+ shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+ shape_interleaved.set(idx_height, std::ceil(shape_interleaved[idx_height] / 4.f));
+ _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
+ _memory_group.manage(&_input_interleaved_reshaped);
+ }
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
+ // For quantized asymmetric input the GEMM output is S32, holding the raw integer accumulators before the quantized output stage.
+ TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+ info_gemm.set_quantization_info(output->info()->quantization_info());
+ _gemm_output.allocator()->init(info_gemm);
+
+ // Configure im2col
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+ }
+ else if(_append_bias)
+ {
+ // Configure add bias kernel
+ _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
}
- // Create GEMM output tensor
- TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, mat_input_rows);
- const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
- // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
- TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
- info_gemm.set_quantization_info(output->info()->quantization_info());
- _gemm_output.allocator()->init(info_gemm);
- _memory_group.manage(&_gemm_output);
-
- // Configure kernels
- // Configure im2col
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
-
// Configure matrix multiply
- if(_mm_optimised_kernel != nullptr)
+ if(run_optimised)
{
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int M = _gemm_output.info()->tensor_shape().y();
- const int N = _gemm_output.info()->tensor_shape().x();
- const int K = _input_im2col_reshaped.info()->tensor_shape().x();
-
-#if defined(__aarch64__)
- if((N <= 128) && (K <= 128))
+ if(!setup_assembly_kernel(_skip_im2col ? input : &_input_im2col_reshaped, weights, is_nhwc ? output : &_gemm_output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue))
{
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64NativeKernel>();
+ ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
}
- else
-#endif /* defined(__aarch64__) */
- {
- configure_asm_mm(ci, M, N, K);
- }
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
-
- _workspace.allocator()->allocate();
}
else
{
@@ -435,8 +400,8 @@
_input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
// Configure GEMM
- configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(1), 0 /* no transpose */,
- _input_im2col_reshaped.info()->dimension(0)));
+ configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(idx_height), 0 /* no transpose */,
+ _input_im2col_reshaped.info()->dimension(idx_width)));
_input_interleaved_reshaped.allocator()->allocate();
}
else
@@ -445,48 +410,63 @@
}
}
- _input_im2col_reshaped.allocator()->allocate();
-
- // Configure output stage for quantized case
- if(_is_quantized)
+ if(!_skip_im2col)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ _input_im2col_reshaped.allocator()->allocate();
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _memory_group.manage(&_tmp_output);
- _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+ // Configure output stage for quantized case
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _memory_group.manage(&_tmp_output);
+ _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+ }
+
+ // Configure Col2Im
+ if(!is_nhwc)
+ {
+ _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
+ }
+
+ if(_is_quantized)
+ {
+ _tmp_output.allocator()->allocate();
+ }
+ _gemm_output.allocator()->allocate();
}
- // Configure Col2Im
- _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
- if(_is_quantized)
- {
- _tmp_output.allocator()->allocate();
- }
- _gemm_output.allocator()->allocate();
-
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
// Allocate intermediate tensor
if(!_are_weights_reshaped)
{
_weights_reshaped.allocator()->allocate();
}
+
+ // Configure Activation Layer
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(output);
DataType dt{};
bool append_bias{};
+ bool skip_im2col{};
bool are_weights_reshaped{};
bool is_fully_connected_convolution{};
bool is_interleaved{};
bool is_quantized{};
+ bool is_activationlayer_enabled{};
unsigned int kernel_width = 0;
unsigned int kernel_height = 0;
unsigned int mat_weights_cols = 0;
@@ -494,9 +474,14 @@
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, append_bias, are_weights_reshaped, kernel_width, kernel_height,
- is_fully_connected_convolution, is_interleaved, is_quantized, mat_weights_cols, mat_weights_rows,
- conv_w, conv_h);
+ const DataLayout data_layout = input->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, act_info, dt, append_bias, skip_im2col, are_weights_reshaped, kernel_width, kernel_height,
+ is_fully_connected_convolution, is_interleaved, is_quantized, is_activationlayer_enabled, mat_weights_cols, mat_weights_rows,
+ conv_w, conv_h, dilation);
const Size2D kernel_weights = Size2D(kernel_width, kernel_height);
@@ -505,68 +490,11 @@
std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
bool optimised_kernel = false;
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ if(dt == DataType::F32)
{
optimised_kernel = true;
}
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
- {
- optimised_kernel = true;
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
- // Reshape weights if needed
- if(optimised_kernel)
- {
- if(are_weights_reshaped)
- {
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->dimension(1);
- }
- else
- {
- TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
-
- // Create tensor to store the reshaped weights
- reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
- weights = reshaped_weights.get();
- }
- }
- else
- {
- if(are_weights_reshaped)
- {
- const unsigned int transpose_width = 16 / input->element_size();
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->dimension(0) / transpose_width + (append_bias ? 1 : 0);
- }
- else
- {
- TensorShape reshaped_weights_shape;
-
- if(is_fully_connected_convolution || is_quantized)
- {
- reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->element_size();
- reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
- static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
- }
-
- // Create tensor to store the reshaped weights
- reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
- weights = reshaped_weights.get();
- }
- }
-
- // Validate im2col
const unsigned int mat_input_cols = mat_weights_rows;
const unsigned int mat_input_rows = conv_w * conv_h;
TensorShape shape_im2col = input->tensor_shape();
@@ -574,7 +502,17 @@
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false));
+
+ if(!skip_im2col)
+ {
+ // Validate im2col
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false, false, dilation));
+ }
+ else if(append_bias)
+ {
+ // Validate add bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+ }
// Create GEMM output tensor
TensorShape shape_gemm(im2_col_info.tensor_shape());
@@ -582,19 +520,63 @@
shape_gemm.set(1, mat_input_rows);
TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
- // Validate GEMM interleave and multiply
- if(is_interleaved)
+ // Reshape weights if needed
+ if(optimised_kernel)
{
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+ ARM_COMPUTE_RETURN_ERROR_ON(are_weights_reshaped);
+
+ // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
}
- else
+ else if(!is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+ TensorShape reshaped_weights_shape;
+
+ if(is_fully_connected_convolution || is_quantized)
+ {
+ reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->element_size();
+ reshaped_weights_shape = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+ static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+ }
+
+ // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+ weights = reshaped_weights.get();
+
+ // Validate GEMM interleave and multiply
+ if(is_interleaved)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+ shape_interleaved.set(idx_height, std::ceil(shape_interleaved.y() / 4.f));
+ TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo(shape_im2col[1], // m
+ weights->tensor_shape()[0], // n
+ shape_im2col[0]) /* k */));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+ }
+ }
+ if(!is_nhwc)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(idx_width) != conv_w) || (output->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
+
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
}
return Status{};
@@ -605,19 +587,33 @@
// Run weights reshaping (runs once for every configure)
if(!_are_weights_reshaped)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_are_weights_reshaped = true;
_reshape_weights.run();
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
- // Run input reshaping
- NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+ if(!_skip_im2col)
+ {
+ // Run input reshaping
+ const unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ NEScheduler::get().schedule(&_input_im2col_kernel, y_dim);
+ }
// Runs matrix multiply on reshaped matrices
- if(_mm_optimised_kernel != nullptr)
+ if(_asm_glue._optimised_kernel != nullptr)
{
- NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _asm_glue.run();
+ // Release weights in case buffer is pretransposed
+ if(!_weights_reshaped.is_used())
+ {
+ _weights_reshaped.allocator()->free();
+ }
}
else
{
@@ -638,6 +634,11 @@
}
}
+ if(_skip_im2col && _append_bias)
+ {
+ NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
+ }
+
// Run output stage for quantized case
if(_is_quantized)
{
@@ -645,7 +646,15 @@
}
// Reshape output matrix
- NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+ if(_data_layout == DataLayout::NCHW)
+ {
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+ }
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
_memory_group.release();
}
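Editor's note: the _skip_im2col flag introduced above exploits a layout property: with a 1x1 kernel in NHWC, each spatial position's channels are already contiguous, so the input is effectively a (W*H) x C row-major matrix and GEMM can consume it without the im2col rearrangement. A sketch of the predicate, matching the condition set in configure():

    #include "arm_compute/core/Types.h"

    // 1x1 kernel + NHWC layout: input rows are already im2col rows.
    bool can_skip_im2col(arm_compute::DataLayout layout,
                         unsigned int kernel_w, unsigned int kernel_h)
    {
        return layout == arm_compute::DataLayout::NHWC && kernel_w == 1 && kernel_h == 1;
    }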
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 9b36e81..98b4767 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017 ARM Limited.
+/* Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -39,20 +35,11 @@
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
using namespace arm_compute;
NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+ : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
+ _workspace(), _B_pretransposed()
{
}
@@ -65,89 +52,29 @@
ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ bool run_optimised = false;
#ifdef __aarch64__
- const int M = output->info()->tensor_shape().y();
- const int N = output->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
- constexpr size_t workspace_alignment = 4096;
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
+ switch(a->info()->data_type())
+ {
+ case DataType::S8:
+ {
+ run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_signed);
+ break;
+ }
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_unsigned);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
+ }
#endif /* __aarch64__ */
-
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else
-#elif defined(ARM_COMPUTE_AARCH64_V8A)
- if(ci.CPU == CPUTarget::A53)
- {
- switch(a->info()->data_type())
- {
- case DataType::S8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- case DataType::U8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
- }
-
- _memory_group.manage(&_workspace);
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else if(1) // Generic v8a kernel
- {
- switch(a->info()->data_type())
- {
- case DataType::S8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- case DataType::U8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
- }
- _memory_group.manage(&_workspace);
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+ if(!run_optimised)
{
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
TensorShape shape_tmp_a = a->info()->tensor_shape();
@@ -206,7 +133,18 @@
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ {
+ _asm_glue_unsigned.run();
+ }
+ else if(_asm_glue_signed._optimised_kernel != nullptr)
+ {
+ _asm_glue_signed.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
_memory_group.release();
}
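
The run() change above completes a pattern: configure() attempts a data-type-specific assembly path (signed or unsigned glue) and run() dispatches to whichever one was set up, falling back to the generic interleave/transpose/multiply kernels only when neither is. A minimal standalone sketch of that dispatch shape; Dispatcher and its members are illustrative names, not the library's:

    #include <functional>
    #include <iostream>

    // Stand-ins for the configured assembly paths and the generic fallback.
    struct Dispatcher
    {
        std::function<void()> asm_unsigned; // set when the U8/QASYMM8 path configured successfully
        std::function<void()> asm_signed;   // set when the S8 path configured successfully
        std::function<void()> generic;      // always-available fallback

        void run() const
        {
            if(asm_unsigned)
            {
                asm_unsigned();
            }
            else if(asm_signed)
            {
                asm_signed();
            }
            else
            {
                generic();
            }
        }
    };

    int main()
    {
        Dispatcher d;
        d.generic    = [] { std::cout << "generic NEON kernels\n"; };
        d.asm_signed = [] { std::cout << "S8 assembly kernel\n"; };
        d.run(); // prints "S8 assembly kernel": the optimised path wins over the fallback
    }
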
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index ad47593..2e06fa2 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -26,11 +26,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -39,58 +37,48 @@
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+ : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
+ _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(gemm_info);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
_a_offset = a->info()->quantization_info().offset;
_b_offset = b->info()->quantization_info().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- // Check for DOT product instruction
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
- if(cpu_has_dotprod != 0)
+#ifdef __aarch64__
+ switch(a->info()->data_type())
{
- _dot_product_path = true;
-
- // Configure matrix multiply kernel
- struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int M = output->info()->tensor_shape().y();
- const int N = output->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- const size_t workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
- constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
- _mm_kernel = std::move(k);
+ case DataType::S8:
+ {
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
+ break;
+ }
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
}
- else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+ if(!_dot_product_path)
{
if(_run_vector_matrix_multiplication)
{
@@ -110,7 +98,10 @@
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
_memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
// Configure interleave kernel
{
@@ -141,7 +132,10 @@
TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
- _memory_group.manage(&_vector_sum_col);
+ if(!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_vector_sum_col);
+ }
// Configure Matrix B reduction kernel
_mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
@@ -168,10 +162,6 @@
_tmp_a.allocator()->allocate();
_tmp_b.allocator()->allocate();
}
- else
- {
- _workspace.allocator()->allocate();
- }
if(_a_offset != 0)
{
@@ -203,42 +193,28 @@
int32_t b_offset = b->quantization_info().offset;
bool run_vector_matrix_multiplication = a->dimension(1) < 2;
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- // Check for DOT product instruction
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
- if(cpu_has_dotprod != 0)
+ if(!run_vector_matrix_multiplication)
{
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
}
else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
{
- if(!run_vector_matrix_multiplication)
- {
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -282,13 +258,24 @@
NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
}
- if(_mtx_b_reshape_kernel)
+ if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
{
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
}
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ {
+ _asm_glue_unsigned.run();
+ }
+ else if(_asm_glue_signed._optimised_kernel != nullptr)
+ {
+ _asm_glue_signed.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
@@ -297,7 +284,7 @@
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
+ if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
{
NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
}
@@ -306,4 +293,6 @@
NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
_memory_group.release();
+
+ _is_first_run = false;
}
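
For context, a caller-side sketch of the reshape-B-only-on-first-run behaviour wired up above. This assumes the usual v18.05 runtime API (Tensor, TensorInfo, QuantizationInfo, GEMMInfo); the shapes, scales and offsets are made-up example values:

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // M=4, N=8, K=16; the lowp GEMM consumes quantized inputs and produces S32.
        Tensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::S32));

        NEGEMMLowpMatrixMultiplyCore gemmlowp;
        // GEMMInfo(is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run)
        gemmlowp.configure(&a, &b, &dst, GEMMInfo(false, false, true));

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        gemmlowp.run(); // first run: reshapes B and runs the matrix B reduction
        gemmlowp.run(); // later runs reuse the reshaped B, per the _is_first_run guards above
    }
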
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index b962db9..6b95cb0 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -23,19 +23,30 @@
*/
#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+NEIm2Col::NEIm2Col()
+ : _kernel(), _y_dim(1)
{
- auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
- k->configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
- _kernel = std::move(k);
}
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
{
- return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
+ _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+{
+ return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+void NEIm2Col::run()
+{
+ NEScheduler::get().schedule(&_kernel, _y_dim);
}
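
NEIm2Col is now a real function with its own run() that schedules the kernel over the height dimension of the input's data layout. For readers unfamiliar with the transform itself, a minimal standalone im2col (single channel, stride 1, no padding, no bias column; illustrative, not the library's kernel):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Each output row holds the kw*kh input patch seen by the convolution
    // kernel at one output position, turning convolution into a GEMM.
    std::vector<std::vector<float>> im2col(const std::vector<float> &in, int w, int h, int kw, int kh)
    {
        const int out_w = w - kw + 1;
        const int out_h = h - kh + 1;
        std::vector<std::vector<float>> cols;
        for(int y = 0; y < out_h; ++y)
        {
            for(int x = 0; x < out_w; ++x)
            {
                std::vector<float> patch;
                for(int ky = 0; ky < kh; ++ky)
                {
                    for(int kx = 0; kx < kw; ++kx)
                    {
                        patch.push_back(in[(y + ky) * w + (x + kx)]);
                    }
                }
                cols.push_back(patch); // one row per convolution position
            }
        }
        return cols;
    }

    int main()
    {
        std::vector<float> img(4 * 4);
        for(std::size_t i = 0; i < img.size(); ++i)
        {
            img[i] = static_cast<float>(i);
        }
        const auto cols = im2col(img, 4, 4, 3, 3);
        std::cout << cols.size() << " rows of " << cols[0].size() << " values\n"; // 4 rows of 9
    }
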
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index fa62483..d0b80fb 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,26 @@
_sumsq.allocator()->allocate();
}
+Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+ TensorShape shape(input->tensor_shape());
+
+ // Create intermediate tensor info
+ TensorInfo sum_sq;
+ sum_sq.set_data_type(input->data_type());
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+ // Reduce shape on axis (supported axis is 0)
+ shape.set(0, 1);
+ sum_sq.set_tensor_shape(shape);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+ return Status{};
+}
+
void NEL2NormalizeLayer::run()
{
_memory_group.acquire();
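
The new validate() mirrors the two-stage configure: a SUM_SQUARE reduction along axis 0 (shape collapsed to 1 on that axis), then the normalize kernel. The underlying math, as a standalone sketch with epsilon as the lower bound on the norm:

    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <vector>

    // L2-normalize one row (axis 0): divide each element by
    // sqrt(max(sum of squares, epsilon)).
    void l2_normalize(std::vector<float> &row, float epsilon)
    {
        float sum_sq = 0.f;
        for(float v : row)
        {
            sum_sq += v * v;
        }
        const float norm = std::sqrt(std::max(sum_sq, epsilon));
        for(float &v : row)
        {
            v /= norm;
        }
    }

    int main()
    {
        std::vector<float> row{ 3.f, 4.f };
        l2_normalize(row, 1e-12f);
        std::cout << row[0] << ", " << row[1] << "\n"; // 0.6, 0.8
    }
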
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 45ddb70..913acf8 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,39 +33,102 @@
using namespace arm_compute;
+namespace
+{
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
+{
+ ARM_COMPUTE_UNUSED(output);
+
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
+
+ bool has_bias = (biases != nullptr);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ const size_t mat_weights_cols = weights->dimension(3);
+ const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->dimension(4);
+
+ shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+
+ shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ shape_gemm = shape_im2col;
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false)
+ _is_first_run(false), _original_weights(nullptr)
{
}
+Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+ bool has_bias = (biases != nullptr);
+
+ if(has_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+ }
+
+ const unsigned int kernel_width = weights->dimension(0);
+ const unsigned int kernel_height = weights->dimension(1);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+ TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+ TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+ TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+
+ return Status{};
+}
+
void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
- }
-
- bool _has_bias = (biases != nullptr);
- _is_first_run = true;
-
- // Get parameters for conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- unsigned int pad_x = 0;
- unsigned int pad_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- std::tie(pad_x, pad_y) = conv_info.pad();
+ bool _has_bias = (biases != nullptr);
+ _is_first_run = true;
+ _original_weights = weights;
const unsigned int kernel_width = weights->info()->dimension(0);
const unsigned int kernel_height = weights->info()->dimension(1);
@@ -76,32 +139,14 @@
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
conv_info);
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
-
- // Create tensor to store the reshaped weights
- const size_t mat_weights_cols = weights->info()->dimension(3);
- const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
- const size_t mat_weights_num = weights->info()->dimension(4);
-
- const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+ // Calculate intermediate buffer shapes
+ TensorShape shape_wr;
+ TensorShape shape_im2col;
+ TensorShape shape_gemm;
+ calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
_weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
- // Create tensor to store im2col reshaped inputs
- const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, mat_input_cols);
- shape_im2col.set(1, mat_input_rows);
- shape_im2col.set(2, 1);
-
_input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
- // Create locally connected layer output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, mat_input_rows);
_gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
// Manage intermediate buffers
@@ -125,8 +170,13 @@
// Run weights reshaping (Runs once for every configure)
if(_is_first_run)
{
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
_is_first_run = false;
NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
}
_memory_group.acquire();
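
To make the calculate_shapes() arithmetic concrete, a worked example with made-up dimensions: an 8x8x3 input, 3x3x3x16x36 weights (one 3x3x3x16 filter bank per output position), stride 1, no padding, with bias:

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const int  in_w = 8, in_h = 8, in_c = 3;
        const int  k_w = 3, k_h = 3, n_filters = 16;
        const bool has_bias = true;

        const int conv_w = in_w - k_w + 1; // 6
        const int conv_h = in_h - k_h + 1; // 6

        const std::size_t mat_weights_cols = n_filters;                             // weights dim 3 -> 16
        const std::size_t mat_weights_rows = k_w * k_h * in_c + (has_bias ? 1 : 0); // 28
        const std::size_t mat_weights_num  = conv_w * conv_h;                       // weights dim 4 -> 36

        std::cout << "shape_wr:     " << mat_weights_cols << " x " << mat_weights_rows << " x " << mat_weights_num << "\n"; // 16 x 28 x 36
        std::cout << "shape_im2col: " << mat_weights_rows << " x " << conv_w * conv_h << " x 1\n";                          // 28 x 36 x 1
        std::cout << "shape_gemm:   " << mat_weights_cols << " x " << conv_w * conv_h << "\n";                              // 16 x 36
    }
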
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 5a474e4..cf6b984 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "support/ToolchainSupport.h"
@@ -30,11 +31,21 @@
using namespace arm_compute;
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
}
Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
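
The configure() change above handles x-dimension broadcasting: when one input has width 1 and the output is wider, the width-1 input gets a REPLICATE border so the vectorized kernel can safely read past its end. The arithmetic being arranged is just scalar broadcast, sketched standalone:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Multiply a wide row by a width-1 input replicated across it.
    std::vector<float> broadcast_mul(const std::vector<float> &wide, float narrow)
    {
        std::vector<float> out(wide.size());
        for(std::size_t i = 0; i < wide.size(); ++i)
        {
            out[i] = wide[i] * narrow;
        }
        return out;
    }

    int main()
    {
        for(float v : broadcast_mul({ 1.f, 2.f, 3.f, 4.f }, 0.5f))
        {
            std::cout << v << " "; // 0.5 1 1.5 2
        }
        std::cout << "\n";
    }
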
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index bc0b6f8..cbfd684 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -31,7 +31,7 @@
using namespace arm_compute;
NEPoolingLayer::NEPoolingLayer()
- : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false)
+ : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
{
}
@@ -40,17 +40,31 @@
// Check if we have Global Pooling Layer
_is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size().width) && (input->info()->dimension(1) == pool_info.pool_size().height);
+ // Get data layout
+ _data_layout = input->info()->data_layout();
+
// Configure pooling kernel
_pooling_layer_kernel.configure(input, output, pool_info);
- // Configure border depending on operation required (quantize border in case of asymmetric data_type)
- BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- PixelValue zero_value(0.f);
- if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+ switch(_data_layout)
{
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ case DataLayout::NCHW:
+ {
+ // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+ BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ PixelValue zero_value(0.f);
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+ {
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+ _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
+ break;
+ }
+ case DataLayout::NHWC:
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
}
- _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
}
Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -60,9 +74,20 @@
void NEPoolingLayer::run()
{
- // Fill border
- NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ switch(_data_layout)
+ {
+ case DataLayout::NCHW:
+ // Fill border
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
- // Run pooling layer
- NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+ // Run pooling layer
+ NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+ break;
+ case DataLayout::NHWC:
+ // Run pooling layer
+ NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
}
\ No newline at end of file
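
The pooling changes split behaviour by data layout: NCHW keeps the border fill and is scheduled over Y (Z for global pooling), while NHWC skips the border pass and is scheduled over X. The decision itself, as a standalone sketch with illustrative enums:

    #include <iostream>

    enum class Layout { NCHW, NHWC };
    enum class SplitDim { X, Y, Z };

    // Mirror of the run() dispatch above: NHWC runs the kernel over X with no
    // border pass; NCHW fills the border first and splits over Y, or Z when
    // the pooling is global.
    SplitDim pooling_split_dim(Layout layout, bool global_pooling)
    {
        if(layout == Layout::NHWC)
        {
            return SplitDim::X;
        }
        return global_pooling ? SplitDim::Z : SplitDim::Y;
    }

    int main()
    {
        std::cout << (pooling_split_dim(Layout::NCHW, true) == SplitDim::Z) << "\n";  // 1
        std::cout << (pooling_split_dim(Layout::NHWC, false) == SplitDim::X) << "\n"; // 1
    }
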
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index a131c48..8f7db96 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
@@ -34,8 +35,21 @@
{
}
+Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ TensorInfo min_max{ input->num_channels(), input->data_type() };
+ ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+
+ return Status{};
+}
+
void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
// Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
_min_max_kernel.configure(input, &_min_max);
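
The new validate() reflects the two kernels behind the function: a min/max reduction of the input followed by the quantization itself. A standalone sketch of the principle (linear map of [min, max] onto [0, 255]; not the library's exact rounding):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    std::vector<uint8_t> quantize(const std::vector<float> &in)
    {
        const auto  mm    = std::minmax_element(in.begin(), in.end());
        const float lo    = *mm.first;
        const float range = std::max(*mm.second - lo, 1e-7f); // avoid division by zero
        std::vector<uint8_t> out(in.size());
        for(std::size_t i = 0; i < in.size(); ++i)
        {
            out[i] = static_cast<uint8_t>(std::lround((in[i] - lo) / range * 255.f));
        }
        return out;
    }

    int main()
    {
        for(uint8_t q : quantize({ -1.f, 0.f, 1.f }))
        {
            std::cout << static_cast<int>(q) << " "; // 0 128 255
        }
        std::cout << "\n";
    }
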
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index f1a9145..cd0b42f 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,6 +63,13 @@
{
}
+Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+
+ return Status{};
+}
+
void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bd565c9..a9c85bd 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,6 @@
void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
{
ARM_COMPUTE_ERROR_ON(nullptr == offsets);
- ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
ARM_COMPUTE_UNUSED(sampling_policy);
Window win;
@@ -66,7 +65,7 @@
const int in_xi = std::floor(in_x);
const int in_yi = std::floor(in_y);
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int>(input_element_size);
*reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
*reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
},
@@ -99,20 +98,20 @@
void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
+ // Get data layout and width/height indices
+ const DataLayout data_layout = input->info()->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Get the tensor shape
- const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+ const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height));
// Compute the ratio between source width/height and destination width/height
- const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
- const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+ const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
+ const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
// Get the element size of the input image
const size_t input_element_size = input->info()->element_size();
@@ -123,9 +122,6 @@
policy = InterpolationPolicy::NEAREST_NEIGHBOR;
}
- // Check if the border mode is UNDEFINED
- const bool border_undefined = border_mode == BorderMode::UNDEFINED;
-
switch(policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -133,7 +129,7 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
_offsets.allocator()->init(tensor_info_offsets);
- _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined, sampling_policy);
+ _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -151,7 +147,7 @@
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
- _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined, sampling_policy);
+ _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -164,7 +160,7 @@
}
case InterpolationPolicy::AREA:
{
- _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+ _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
break;
}
default:
@@ -174,6 +170,48 @@
_border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
}
+Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
+ BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+ ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
+
+ ITensorInfo *offsets = nullptr;
+ ITensorInfo *dx = nullptr;
+ ITensorInfo *dy = nullptr;
+
+ // Get data layout and width/height indices
+ const DataLayout data_layout = input->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Get the tensor shape of auxiliary buffers
+ const TensorShape shape(output->dimension(idx_width), output->dimension(idx_height));
+
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dx(shape, Format::F32);
+ TensorInfo tensor_info_dy(shape, Format::F32);
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ offsets = &tensor_info_offsets;
+ break;
+ case InterpolationPolicy::BILINEAR:
+ offsets = &tensor_info_offsets;
+ dx = &tensor_info_dx;
+ dy = &tensor_info_dy;
+ break;
+ default:
+ break;
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
+ policy, border_mode, sampling_policy));
+ return Status{};
+}
+
void NEScale::run()
{
NEScheduler::get().schedule(&_border_handler, Window::DimZ);
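
The precomputed offsets/dx/dy tensors drive the bilinear path: the integer part of the source coordinate picks the top-left pixel (scaled by the element size into _offsets) and the fractional parts weight the 2x2 neighbourhood. A standalone sketch of that sampling step, not the kernel's vectorized code:

    #include <cmath>
    #include <iostream>

    // Bilinear sample of a 2x2 image at fractional coordinates in [0, 1).
    float bilinear(const float p[2][2], float in_x, float in_y)
    {
        const int   xi = static_cast<int>(std::floor(in_x));
        const int   yi = static_cast<int>(std::floor(in_y));
        const float dx = in_x - xi; // what the _dx tensor stores per pixel
        const float dy = in_y - yi; // what the _dy tensor stores per pixel
        return p[yi][xi] * (1 - dx) * (1 - dy) + p[yi][xi + 1] * dx * (1 - dy)
               + p[yi + 1][xi] * (1 - dx) * dy + p[yi + 1][xi + 1] * dx * dy;
    }

    int main()
    {
        const float p[2][2] = { { 0.f, 1.f }, { 2.f, 3.f } };
        std::cout << bilinear(p, 0.5f, 0.5f) << "\n"; // 1.5, the average of the four pixels
    }
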
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..8f2c4c4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+
+namespace arm_compute
+{
+namespace
+{
+inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
+{
+ const DataLayout data_layout = input->info()->data_layout();
+ const int in_width = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+ const int in_height = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+ const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const int in_batches = input->info()->dimension(3);
+
+ return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ const DataLayout data_layout = input->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ ARM_COMPUTE_UNUSED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3x3 and 5x5 kernels are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ return Status{};
+}
+
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+ Size2D output_tile = Size2D{};
+
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 5U))
+ {
+ output_tile = Size2D(2U, 2U);
+ }
+
+ return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check if we want to configure a Winograd configuration which requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd =
+ {
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} // namespace
+
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
+ _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
+ _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+{
+}
+
+void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
+
+ // Get indices for the width and height
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
+ const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ _weights = weights;
+ _input = input;
+ _output = output;
+
+ std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
+ std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
+ std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>> transform_output_kernel;
+
+ int n_gemms = 0;
+ int N_BLOCK = 0; // Size of block used by GEMM.
+
+ switch(kernel_size.width)
+ {
+ case 3:
+ {
+ if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
+ {
+ transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
+ transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
+ transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
+ }
+ else
+ {
+ transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
+ transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
+ transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+ }
+ break;
+ }
+ case 5:
+ {
+ transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
+ transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
+ transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
+ n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
+ N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+ }
+
+ const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
+ const bool use_same_padding = use_padding_type == PADDING_SAME;
+
+ // Get convolved dimensions
+ const int in_channels = input->info()->dimension(channel_idx);
+ const int out_channels = output->info()->dimension(channel_idx);
+
+ const Tensor4DShape in_shape(internal_get_input_shape(input));
+ const size_t data_type_size = input->info()->element_size();
+ // Get the memory required to instantiate a new Winograd operator.
+ constexpr size_t storage_alignment = 64;
+ const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
+ _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _kernel_storage.allocator()->allocate();
+ // Input storage
+ const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
+ _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _input_workspace.allocator()->allocate();
+
+ // Output storage
+ const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
+ _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _output_workspace.allocator()->allocate();
+
+ // Configure and allocate the destination tensor used to convert the output from the Winograd domain back to the spatial domain
+ TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
+ _output->info()->dimension(1), _output->info()->dimension(3)),
+ 1, _output->info()->data_type());
+ _output_nhwc.allocator()->init(info);
+ _output_nhwc.allocator()->allocate();
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+ _weights_hwio.allocator()->allocate();
+
+ // Configure the kernel to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ _input_nhwc.allocator()->allocate();
+
+ const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
+
+ // Configure the InputTransform
+ const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+ transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
+
+ // Configure WeightsTransform
+ const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
+ transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
+
+ // Configure OutputTransform
+ // The biases tensor has not been allocated at this point in time; the output transform will add the biases to the final result in the run() method
+ const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+ const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
+
+ transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
+ output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
+ in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+
+ // Configure GEMM
+ const int tile_rows = iceildiv(output_shape.n_rows, output_tile.height);
+ const int tile_cols = iceildiv(output_shape.n_cols, output_tile.width);
+ const int m = in_shape.n_batches * tile_rows * tile_cols;
+ const int k = in_shape.n_channels;
+ const int n = out_channels;
+ const int input_matrix_row_stride = in_shape.n_channels;
+ const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+ const int output_matrix_row_stride = kernel_matrix_row_stride;
+ unsigned int num_threads = NEScheduler::get().num_threads();
+
+ _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
+ _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
+ kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
+
+ auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
+ acl_gemm_wrapper->configure(_arm_gemm.get());
+ const size_t workspace_size = _arm_gemm->get_working_size();
+
+ // Allocate workspace
+ if(workspace_size > 0)
+ {
+ const unsigned int alignment = 4096;
+ allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
+ _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+ }
+
+ const unsigned int window_size = _arm_gemm->get_window_size();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ _arm_gemm->set_nthreads(num_threads);
+ }
+
+ _gemm_kernel = std::move(acl_gemm_wrapper);
+
+ // Reorder the convolved output to ACL's NCHW ordering
+ _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+
+ _transform_input_kernel = std::move(transform_input_kernel);
+ _transform_weights_kernel = std::move(transform_weights_kernel);
+ _transform_output_kernel = std::move(transform_output_kernel);
+
+ // Configure activation layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
+}
+
+void NEWinogradConvolutionLayer::run()
+{
+ _memory_group.acquire();
+ if(!_reshaped_kernel)
+ {
+ _reshaped_kernel = true;
+ _permute_weights.run();
+ NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+ }
+ // Bring channels to the front, as the Winograd code expects the tensor in NHWC format
+ _permute_input.run();
+
+ // Transform input tensor to the winograd domain
+ NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
+
+ // Run the n_gemms batched GEMMs in multiple threads; each kernel run covers one or more of them
+ NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
+
+ // Transform output tensor to the spatial domain
+ NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
+
+ // Reorder the convolved output to ACL's NCHW ordering
+ _permute_output.run();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
+ _memory_group.release();
+}
+
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height));
+ const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ input->data_layout());
+
+ // Validate input transform
+ const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+ const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
+ switch(weights->dimension(idx_width))
+ {
+ case 3:
+ {
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
+ }
+ break;
+ }
+ case 5:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+ break;
+ }
+ }
+ // Validate filter transform
+ const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+
+ switch(weights->dimension(idx_width))
+ {
+ case 3:
+ {
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
+ }
+ break;
+ }
+ case 5:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+ break;
+ }
+ }
+ // Validate batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+ switch(weights->dimension(idx_width))
+ {
+ case 3:
+ {
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ // Validate output transform
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+ }
+ else
+ {
+ // Validate output transform
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+ }
+ break;
+ }
+ case 5:
+ {
+ // Validate output transform
+ ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+ break;
+ }
+ }
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+ }
+ return Status{};
+}
+
+} // namespace arm_compute
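
For reference, a minimal caller-side sketch of the new validate() entry point follows. The shapes, the header path, and the final ErrorCode::OK comparison are illustrative assumptions, not taken from the patch; what the sketch exercises is the tile-selection rule above (a 3x3 kernel on an input larger than 4x4 picks the 4x4 output tile) and the rule that any configuration flagged by check_support_fast_math() is rejected unless enable_fast_math is true.

    // Caller-side sketch (hypothetical shapes): probe whether the Winograd
    // path accepts a configuration before configuring the function.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"

    using namespace arm_compute;

    bool winograd_is_usable()
    {
        const TensorInfo src(TensorShape(56U, 56U, 64U), 1, DataType::F32);      // NCHW: W, H, C
        const TensorInfo weights(TensorShape(3U, 3U, 64U, 128U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(56U, 56U, 128U), 1, DataType::F32);
        const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides, 1-pixel padding for 3x3

        const Status status = NEWinogradConvolutionLayer::validate(&src, &weights, nullptr, &dst,
                                                                   conv_info, ActivationLayerInfo(),
                                                                   true /* enable_fast_math */);
        return status.error_code() == ErrorCode::OK;
    }
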
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
deleted file mode 100644
index 0ac6d09..0000000
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-
-namespace
-{
-inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
-{
- const int in_width = input->info()->dimension(0);
- const int in_height = input->info()->dimension(1);
- const int in_batches = input->info()->dimension(3);
- const int in_channels = input->info()->dimension(2);
- return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
-}
-} /* namespace */
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
- ARM_COMPUTE_UNUSED(output);
-
- return Status{};
-}
-} //namespace
-
-NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _permute_input(),
- _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
- _reshaped_kernel(false)
-{
-} /* arm_compute */
-
-void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), biases->info(), output->info(), conv_info));
-
- _weights = weights;
- _input = input;
- _output = output;
-
- std::unique_ptr<INEWinogradLayerBatchedGEMMKernel<float, float>> batched_gemm_kernel;
- std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
- std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
- std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>> transform_output_kernel;
-
- switch(weights->info()->dimension(0))
- {
- case 3:
- {
- batched_gemm_kernel = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>>();
- transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
- transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
- transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
- break;
- }
- case 5:
- {
- batched_gemm_kernel = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>>();
- transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
- transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
- transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
- }
-
- const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
- const bool use_same_padding = use_padding_type == PADDING_SAME;
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
- ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
- // Get convolved dimensions
- const int in_channels = input->info()->dimension(2);
- const int out_channels = output->info()->dimension(2);
-
- const Tensor4DShape in_shape(internal_get_input_shape(input));
- const size_t data_type_size = input->info()->element_size();
- // Get the memory required to instantiate a new Winograd operator.
- constexpr size_t storage_alignment = 64;
- const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
- _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _kernel_storage.allocator()->allocate();
- // Input storage
- const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
- _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _input_workspace.allocator()->allocate();
-
- // Output storage
- const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
- _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
- _output_workspace.allocator()->allocate();
-
- // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
- _output->info()->dimension(1), _output->info()->dimension(3)),
- 1, _output->info()->data_type());
- _output_nhwc.allocator()->init(info);
- _output_nhwc.allocator()->allocate();
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
- _weights_hwio.allocator()->allocate();
-
- // configure the kernel to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- _input_nhwc.allocator()->allocate();
-
- const int weights_width = weights->info()->dimension(0);
- const int weights_height = weights->info()->dimension(1);
- const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
-
- // Configure the InputTransform
- const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
- transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
-
- // Configure WeightsTransform
- const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
- transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
-
- // Configure OutputTransform
- //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- const int output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
- const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
-
- transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
- output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
- in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-
- // Configure Batched GEMMs
- const int output_tile_rows = batched_gemm_kernel->get_output_tile_rows();
- const int output_tile_cols = batched_gemm_kernel->get_output_tile_cols();
- const int n_block = batched_gemm_kernel->get_number_blocks();
- const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
- const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
- const int m = in_shape.n_batches * tile_rows * tile_cols;
- const int k = in_shape.n_channels;
- const int n = out_channels;
- const int input_matrix_row_stride = in_shape.n_channels;
- const int kernel_matrix_row_stride = roundup(out_channels, n_block);
- const int output_matrix_row_stride = kernel_matrix_row_stride;
- const unsigned n_gemms = batched_gemm_kernel->get_number_gemms();
-
- batched_gemm_kernel->configure(n_gemms, m, k, n,
- input_matrix_stride, input_matrix_row_stride,
- kernel_matrix_stride, kernel_matrix_row_stride,
- output_matrix_stride, output_matrix_row_stride,
- reinterpret_cast<float *>(_input_workspace.buffer()),
- reinterpret_cast<float *>(_kernel_storage.buffer()),
- reinterpret_cast<float *>(_output_workspace.buffer()));
-
- // Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
- _transform_input_kernel = std::move(transform_input_kernel);
- _transform_weights_kernel = std::move(transform_weights_kernel);
- _transform_output_kernel = std::move(transform_output_kernel);
- _batched_gemm_kernel = std::move(batched_gemm_kernel);
-}
-
-void NEWinogradLayer::run()
-{
- _memory_group.acquire();
- if(!_reshaped_kernel)
- {
- _reshaped_kernel = true;
- _permute_weights.run();
- NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
- }
- //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- _permute_input.run();
-
- // Transform input tensor to the winograd domain
- NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
-
- //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX);
-
- // Transform output tensor to the spatial domain
- NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
-
- // Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.run();
- _memory_group.release();
-}
-
-Status NEWinogradLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
- ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, weights, biases, output, conv_info));
-
- return Status{};
-}
-
-} // namespace arm_compute
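
Before its removal, NEWinogradLayer::configure() above also spelled out the batched-GEMM sizing: F(2x2, 3x3) yields 16 GEMMs ((2+3-1)^2 transformed positions) of shape m x k x n, where m is the number of output tiles across the batch. A worked example, with iceildiv and roundup reimplemented locally and an illustrative n_block (the real value is kernel-dependent):

    #include <cstdio>

    static int iceildiv(int a, int b) { return (a + b - 1) / b; }
    static int roundup(int a, int b)  { return b * iceildiv(a, b); }

    int main()
    {
        const int n_batches = 1, out_rows = 56, out_cols = 56;
        const int in_channels = 64, out_channels = 128;
        const int output_tile_rows = 2, output_tile_cols = 2; // F(2x2, 3x3)
        const int n_block = 1;                                // illustrative only

        const int tile_rows = iceildiv(out_rows, output_tile_rows); // 28
        const int tile_cols = iceildiv(out_cols, output_tile_cols); // 28
        const int m = n_batches * tile_rows * tile_cols;             // 784
        const int k = in_channels;                                   // 64
        const int n = out_channels;                                  // 128
        const int kernel_matrix_row_stride = roundup(out_channels, n_block);

        std::printf("16 GEMMs of size %dx%dx%d, kernel row stride %d\n",
                    m, k, n, kernel_matrix_row_stride);
        return 0;
    }
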
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index c6802f3..795c96c 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
#include <omp.h>
@@ -41,6 +42,7 @@
OMPScheduler::OMPScheduler() // NOLINT
: _num_threads(omp_get_max_threads())
{
+ get_cpu_configuration(_cpu_info);
}
unsigned int OMPScheduler::num_threads() const
@@ -59,7 +61,7 @@
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
ThreadInfo info;
- info.cpu_info = _info;
+ info.cpu_info = &_cpu_info;
const Window &max_window = kernel->window();
const unsigned int num_iterations = max_window.num_iterations(split_dimension);
@@ -74,7 +76,7 @@
#pragma omp parallel firstprivate(info) num_threads(info.num_threads)
{
const int tid = omp_get_thread_num();
- Window win = max_window.split_window(split_dimension, tid, info.num_threads);
+ Window win = max_window.split_window(split_dimension, tid, info.num_threads);
info.thread_id = tid;
kernel->run(win, info);
}
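
The schedule() loop above hands each OpenMP thread its own slice of the kernel window via split_window(dim, tid, num_threads). A standalone sketch of that pattern, with a flat integer range standing in for Window and a ceil-divided chunk size (the library's split has its own logic):

    #include <omp.h>
    #include <cstdio>

    // Stand-in for kernel->run(win, info): just report the slice.
    static void process(int begin, int end, int tid)
    {
        std::printf("thread %d: [%d, %d)\n", tid, begin, end);
    }

    int main()
    {
        const int num_iterations = 1000;              // max_window.num_iterations(dim)
        const int num_threads    = omp_get_max_threads();

        #pragma omp parallel num_threads(num_threads)
        {
            const int tid   = omp_get_thread_num();
            const int step  = (num_iterations + num_threads - 1) / num_threads;
            const int begin = tid * step;
            const int end   = begin + step > num_iterations ? num_iterations : begin + step;
            process(begin, end, tid); // analogous to kernel->run(win, info)
        }
        return 0;
    }
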
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index 4540aea..d0b3bde 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,19 +58,24 @@
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
// Update blob size
- size_t max_group_size = std::accumulate(std::begin(_active_elements), std::end(_active_elements), static_cast<size_t>(0), [](size_t s, const Element & e)
+ size_t max_group_size = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), static_cast<size_t>(0), [](size_t s, const Blob & b)
{
- return s + e.size;
+ return s + b.max_size;
});
_blob = std::max(_blob, max_group_size);
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
size_t offset = 0;
- for(auto &e : _active_elements)
+ for(auto &free_blob : _free_blobs)
{
- group_mappings[e.handle] = offset;
- offset += e.size;
+ for(auto &bound_element_id : free_blob.bound_elements)
+ {
+ ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
+ Element &bound_element = _active_elements[bound_element_id];
+ group_mappings[bound_element.handle] = offset;
+ }
+ offset += free_blob.max_size;
ARM_COMPUTE_ERROR_ON(offset > _blob);
}
}
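
The effect of the new mapping loop is easier to see in isolation: every element bound to a free blob receives that blob's base offset (so tensors whose lifetimes never overlap share storage), and the running offset advances by the blob's max_size. A simplified stand-in, with plain ints as element ids:

    #include <cstddef>
    #include <list>
    #include <map>
    #include <vector>

    struct Blob
    {
        size_t           max_size;       // largest requirement bound to this blob
        std::vector<int> bound_elements; // element ids that reuse this blob
    };

    std::map<int, size_t> compute_mappings(const std::list<Blob> &free_blobs)
    {
        std::map<int, size_t> mappings; // element id -> byte offset
        size_t offset = 0;
        for(const Blob &b : free_blobs)
        {
            for(int id : b.bound_elements)
            {
                mappings[id] = offset; // non-overlapping lifetimes share this offset
            }
            offset += b.max_size;
        }
        return mappings;
    }

The total arena size is then the final value of offset, which is what the _blob member accumulates across memory groups.
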
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 42cc943..293241d 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,3 +72,10 @@
// Update semaphore
_sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
}
+
+size_t PoolManager::num_pools() const
+{
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+
+ return _free_pools.size() + _occupied_pools.size();
+}
\ No newline at end of file
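
num_pools() takes the same mutex that protects pool registration, so the sum over the free and occupied lists is a consistent snapshot even while other threads lock and unlock pools. The pattern, with simplified stand-in types (note the mutex must be mutable to be lockable from a const accessor, an assumption about the real member):

    #include <cstddef>
    #include <list>
    #include <mutex>

    struct Pool { };

    class PoolRegistry
    {
    public:
        size_t num_pools() const
        {
            std::lock_guard<std::mutex> lock(_mtx);
            return _free.size() + _occupied.size();
        }

    private:
        mutable std::mutex _mtx; // mutable: locked from the const accessor
        std::list<Pool *>  _free;
        std::list<Pool *>  _occupied;
    };
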
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index c5b8f33..b010a32 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -27,6 +27,11 @@
using namespace arm_compute;
+SubTensor::SubTensor()
+ : _parent(nullptr), _info()
+{
+}
+
SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
: _parent(nullptr), _info()
{
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index a0d41b2..993a95b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryRegion.h"
#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -114,7 +115,7 @@
ARM_COMPUTE_UNUSED(validate_subtensor_shape);
// Copy pointer to buffer
- _memory = Memory(allocator._memory.buffer());
+ _memory = Memory(allocator._memory.region());
// Init tensor info with new dimensions
size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
@@ -126,22 +127,23 @@
uint8_t *TensorAllocator::data() const
{
- return _memory.buffer();
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
void TensorAllocator::allocate()
{
- ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+
if(_associated_memory_group == nullptr)
{
- _memory = Memory(std::shared_ptr<uint8_t>(new uint8_t[info().total_size()](), [](uint8_t *ptr)
- {
- delete[] ptr;
- }));
+ _memory = Memory(std::make_shared<MemoryRegion>(info().total_size()));
}
else
{
- _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.handle()), info().total_size());
+ _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.region()->handle()), info().total_size());
+ _memory.region()->set_size(info().total_size());
}
info().set_is_resizable(false);
}
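
allocate() now provisions backing storage as a shared MemoryRegion instead of a raw new[] buffer, and deferred (memory-group) allocations write through region()->handle() and then fix up the size. A minimal stand-in region showing what that contract needs; this mirrors, but is not, the library's MemoryRegion:

    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>

    class SimpleRegion
    {
    public:
        explicit SimpleRegion(size_t size)
            : _mem(size != 0 ? static_cast<uint8_t *>(std::calloc(size, 1)) : nullptr), _size(size)
        {
        }
        ~SimpleRegion() { std::free(_mem); }
        SimpleRegion(const SimpleRegion &) = delete;
        SimpleRegion &operator=(const SimpleRegion &) = delete;

        void  *buffer()  { return _mem; }  // payload; may stay nullptr until a group binds it
        void **handle()  { return reinterpret_cast<void **>(&_mem); } // filled in by finalize_memory()
        void   set_size(size_t size) { _size = size; }               // deferred size fix-up

    private:
        uint8_t *_mem;
        size_t   _size;
    };
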
@@ -154,7 +156,8 @@
arm_compute::Status TensorAllocator::import_memory(Memory memory)
{
- ARM_COMPUTE_RETURN_ERROR_ON(memory.buffer() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(memory.region() == nullptr);
+ ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->buffer() == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
_memory = memory;
info().set_is_resizable(false);
@@ -164,15 +167,17 @@
void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
{
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
- ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
+ ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
_associated_memory_group = associated_memory_group;
}
uint8_t *TensorAllocator::lock()
{
- return _memory.buffer();
+ ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+ return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
}
void TensorAllocator::unlock()