arm_compute v19.02
Change-Id: I853a3ecf38f206da13c1b03640c8adf73c20477c
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index 7f0e374..d9de11e 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -44,6 +44,5 @@
std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
{
- ARM_COMPUTE_UNUSED(alignment);
- return arm_compute::support::cpp14::make_unique<MemoryRegion>(size);
+ return arm_compute::support::cpp14::make_unique<MemoryRegion>(size, alignment);
}
\ No newline at end of file
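A minimal usage sketch (illustration only, assuming just the `Allocator`/`IMemoryRegion` interfaces touched above): with this change the alignment hint is forwarded to the backing `MemoryRegion` instead of being dropped.

```cpp
#include "arm_compute/runtime/Allocator.h"

void make_aligned_region_example()
{
    arm_compute::Allocator allocator{};
    // 1 KiB region backed by a MemoryRegion aligned to a 64-byte boundary;
    // previously the alignment argument was silently ignored.
    auto region = allocator.make_region(1024, 64);
    (void)region;
}
```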
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 2a4ab6e..c5d42b1 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,6 @@
#include <algorithm>
#include <cmath>
#include <map>
-#include <vector>
using namespace arm_compute;
@@ -62,19 +61,21 @@
{
return ba.max_size > bb.max_size;
});
- std::vector<size_t> group_sizes;
+
+ // Create group sizes vector
+ std::vector<BlobInfo> group_sizes;
std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
{
- return b.max_size;
+ return BlobInfo(b.max_size, b.max_alignment);
});
// Update blob sizes
size_t max_size = std::max(_blobs.size(), group_sizes.size());
- _blobs.resize(max_size, 0);
- group_sizes.resize(max_size, 0);
- std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](size_t lhs, size_t rhs)
+ _blobs.resize(max_size);
+ group_sizes.resize(max_size);
+ std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
{
- return std::max(lhs, rhs);
+ return BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
});
// Calculate group mappings
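For clarity, the lifetime manager now merges blob requirements element-wise, taking the maximum of both size and alignment so that a reused blob satisfies every group mapped onto it. A hedged restatement of the `std::transform` lambda above, assuming `BlobInfo` exposes public `size` and `alignment` members and lives in the runtime `Types.h` header:

```cpp
#include "arm_compute/runtime/Types.h" // assumed location of BlobInfo

#include <algorithm>

// Hypothetical standalone helper mirroring the merge lambda above
arm_compute::BlobInfo merge_blob_requirements(const arm_compute::BlobInfo &lhs, const arm_compute::BlobInfo &rhs)
{
    return arm_compute::BlobInfo(std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment));
}
```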
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index e09451c..812cbdd 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -33,11 +33,11 @@
using namespace arm_compute;
-BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<size_t> blob_sizes)
- : _allocator(allocator), _blobs(), _blob_sizes(std::move(blob_sizes))
+BlobMemoryPool::BlobMemoryPool(IAllocator *allocator, std::vector<BlobInfo> blob_info)
+ : _allocator(allocator), _blobs(), _blob_info(std::move(blob_info))
{
ARM_COMPUTE_ERROR_ON(!allocator);
- allocate_blobs(_blob_sizes);
+ allocate_blobs(_blob_info);
}
BlobMemoryPool::~BlobMemoryPool()
@@ -73,16 +73,16 @@
std::unique_ptr<IMemoryPool> BlobMemoryPool::duplicate()
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_sizes);
+ return support::cpp14::make_unique<BlobMemoryPool>(_allocator, _blob_info);
}
-void BlobMemoryPool::allocate_blobs(const std::vector<size_t> &sizes)
+void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- for(const auto &size : sizes)
+ for(const auto &bi : blob_info)
{
- _blobs.push_back(_allocator->make_region(size, 0));
+ _blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
}
}
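A minimal construction sketch (assuming `BlobInfo` has the `(size, alignment)` constructor used above, sits in the `arm_compute` namespace, and that the allocator outlives the pool):

```cpp
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobMemoryPool.h"

#include <vector>

void blob_pool_example()
{
    arm_compute::Allocator allocator{};
    std::vector<arm_compute::BlobInfo> blob_info;
    blob_info.emplace_back(2048, 64); // 2 KiB blob, 64-byte alignment
    blob_info.emplace_back(4096, 0);  // 4 KiB blob, no alignment requirement
    arm_compute::BlobMemoryPool pool(&allocator, std::move(blob_info));
}
```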
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
new file mode 100644
index 0000000..533e6fa
--- /dev/null
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+
+namespace
+{
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
+{
+ printf("%.*s", len, buffer);
+}
+#endif /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
+
+/** This initialises the properties vector with the configuration to be used when creating the OpenCL context
+ *
+ * @param[in]  platform The OpenCL platform used to create the context
+ * @param[in]  device   The OpenCL device to be used to create the context
+ * @param[out] prop     An array of 7 context properties, filled in by this function
+ *
+ * @note In debug builds, this function will enable cl_arm_printf if it's supported.
+ *
+ * The filled array can then be passed directly to the cl::Context constructor.
+ */
+
+void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, cl_context_properties prop[7])
+{
+ ARM_COMPUTE_UNUSED(device);
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+ // Query devices in the context for cl_arm_printf support
+ if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+ {
+ // Create a cl_context with a printf_callback and user specified buffer size.
+ cl_context_properties properties_printf[] =
+ {
+ CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+ // Enable a printf callback function for this context.
+ CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
+ // Request a minimum printf buffer size of 4KB (0x1000 bytes) for devices in the
+ // context that support this extension.
+ CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
+ 0
+ };
+ std::copy_n(properties_printf, 7, prop);
+ }
+ else
+#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
+ {
+ cl_context_properties properties[] =
+ {
+ CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
+ 0
+ };
+ std::copy_n(properties, 3, prop);
+ };
+}
+} // namespace
+
+namespace arm_compute
+{
+std::tuple<cl::Context, cl::Device, cl_int>
+create_opencl_context_and_device()
+{
+ ARM_COMPUTE_ERROR_ON(!opencl_is_available());
+ std::vector<cl::Platform> platforms;
+ cl::Platform::get(&platforms);
+ ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
+ cl::Platform p = platforms[0];
+ cl::Device device;
+ std::vector<cl::Device> platform_devices;
+ p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
+ ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
+ device = platform_devices[0];
+ cl_int err = CL_SUCCESS;
+ cl_context_properties properties[7] = { 0, 0, 0, 0, 0, 0, 0 };
+ initialise_context_properties(p, device, properties);
+ cl::Context cl_context = cl::Context(device, properties, nullptr, nullptr, &err);
+ ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+ return std::make_tuple(cl_context, device, err);
+}
+} // namespace arm_compute
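A minimal usage sketch of the new helper; the error handling mirrors what `CLScheduler::default_init()` does with the returned tuple.

```cpp
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/core/Error.h"

#include <tuple>

void create_context_example()
{
    cl::Context ctx;
    cl::Device  dev;
    cl_int      err = CL_SUCCESS;
    std::tie(ctx, dev, err) = arm_compute::create_opencl_context_and_device();
    ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
}
```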
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index a311c6f..701ffe0 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,22 +23,14 @@
*/
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
+
#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/runtime/CL/CLTuner.h"
#include "arm_compute/runtime/CL/tuners/Tuners.h"
using namespace arm_compute;
-namespace
-{
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-void printf_callback(const char *buffer, unsigned int len, size_t complete, void *user_data)
-{
- printf("%.*s", len, buffer);
-}
-#endif /* defined(ARM_COMPUTE_DEBUG_ENABLED) */
-} // namespace
-
std::once_flag CLScheduler::_initialize_symbols;
CLScheduler::CLScheduler()
@@ -53,53 +45,30 @@
return scheduler;
}
+void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner)
+{
+ if(!_is_initialised)
+ {
+ cl::CommandQueue queue = cl::CommandQueue(ctx, device);
+ CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
+ init(ctx, queue, device, cl_tuner);
+ _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
+ _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
+ }
+}
+
void CLScheduler::default_init(ICLTuner *cl_tuner)
{
if(!_is_initialised)
{
- std::vector<cl::Platform> platforms;
- cl::Platform::get(&platforms);
- ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
- cl::Platform p = platforms[0];
- cl::Context ctx;
- cl::Device device;
- std::vector<cl::Device> platform_devices;
- p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
- ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
- device = platform_devices[0];
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-
- // Query devices in the context for cl_arm_printf support
- if(device_supports_extension(device, "cl_arm_printf"))
- {
- // Create a cl_context with a printf_callback and user specified buffer size.
- cl_context_properties properties[] =
- {
- CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
- // Enable a printf callback function for this context.
- CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
- // Request a minimum printf buffer size of 4MB for devices in the
- // context that support this extension.
- CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
- 0
- };
- ctx = cl::Context(device, properties);
- }
- else
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
- {
- cl_context_properties properties[] =
- {
- CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(p()),
- 0
- };
- ctx = cl::Context(device, properties);
- };
-
- cl::CommandQueue queue = cl::CommandQueue(ctx, device);
- CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
- init(ctx, queue, device, cl_tuner);
-
+ cl::Context ctx;
+ cl::Device dev;
+ cl_int err;
+ std::tie(ctx, dev, err) = create_opencl_context_and_device();
+ ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
+ cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
+ CLKernelLibrary::get().init("./cl_kernels/", ctx, dev);
+ init(ctx, queue, dev, cl_tuner);
// Create a default static tuner and set if none was provided
_cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
}
@@ -108,6 +77,21 @@
_cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
}
+void CLScheduler::set_context(cl::Context context)
+{
+ _context = std::move(context);
+ CLKernelLibrary::get().set_context(_context);
+}
+
+void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner)
+{
+ set_context(std::move(context));
+ _queue = std::move(queue);
+ _target = get_target_from_device(device);
+ _is_initialised = true;
+ _cl_tuner = cl_tuner;
+}
+
void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
{
ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
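A sketch of the new entry point for callers that own the context: the scheduler is initialised from an externally created context/device pair, and passing nullptr for the tuner keeps the default static tuner created inside `default_init_with_context()`.

```cpp
#include "arm_compute/runtime/CL/CLScheduler.h"

// ctx and dev are assumed to be a valid context/device pair, e.g. obtained from
// arm_compute::create_opencl_context_and_device() introduced earlier in this change.
void init_scheduler_with_external_context(cl::Context &ctx, cl::Device &dev)
{
    arm_compute::CLScheduler::get().default_init_with_context(dev, ctx, /* cl_tuner */ nullptr);
}
```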
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5f82cd3..a262d6b 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,10 +33,40 @@
#include <limits>
#include <string>
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+/** Utility function used to initialize the LWS values to test.
+ * Only the LWS values which are a power of two, or which satisfy the modulo condition with the GWS, are taken into account by the CLTuner
+ *
+ * @param[in, out] lws         Vector of LWS values to test for a specific dimension
+ * @param[in]      gws         Size of the GWS
+ * @param[in]      lws_max     Max LWS value allowed to be tested
+ * @param[in]      mod_let_one True if the result of the modulo operation between gws and the lws can be less than or equal to one.
+ */
+void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+ lws.push_back(1);
+
+ for(unsigned int i = 2; i <= lws_max; ++i)
+ {
+ // Power of two condition
+ const bool is_power_of_two = (i & (i - 1)) == 0;
+
+ // Modulo condition, applied according to the mod_let_one flag
+ const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+ if(mod_cond || is_power_of_two)
+ {
+ lws.push_back(i);
+ }
+ }
+}
+} // namespace
CLTuner::CLTuner(bool tune_new_kernels)
- : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+ : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
{
}
@@ -102,32 +132,35 @@
cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
{
+ // Profiling queue
+ cl::CommandQueue queue_profiler;
+
+ // Extract real OpenCL function to intercept
if(real_clEnqueueNDRangeKernel == nullptr)
{
real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
-
- // Get the default queue
- _queue = CLScheduler::get().queue();
-
- // Check if we can use the OpenCL timer with the default queue
- cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
-
- if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
- {
- // Set the queue for profiling
- _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
- }
- else
- {
- _queue_profiler = _queue;
- }
}
+
+ // Get the default queue
+ cl::CommandQueue default_queue = CLScheduler::get().queue();
+
+ // Check if we can use the OpenCL timer with the default queue
+ cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+ if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ {
+ // Set the queue for profiling
+ queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+ }
+ else
+ {
+ queue_profiler = default_queue;
+ }
+
// Start intercepting enqueues:
auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
const cl_event * event_wait_list, cl_event * event)
{
- ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
- ARM_COMPUTE_UNUSED(event);
if(this->kernel_event_is_set())
{
// If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
@@ -139,49 +172,45 @@
// Set OpenCL event
this->set_cl_kernel_event(tmp);
+ if(event != nullptr)
+ {
+ // Return the cl_event from the intercepted call
+ clRetainEvent(tmp);
+ *event = tmp;
+ }
return retval;
};
CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
+ cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
cl::NDRange opt_lws = cl::NullRange;
- const int x_step = std::max(1, kernel.window().x().step());
- const int y_step = std::max(1, kernel.window().y().step());
- const int z_step = std::max(1, kernel.window().z().step());
- const int x_end = kernel.window().x().end() - kernel.window().x().start() / x_step > 1 ? 16 : 1;
- const int y_end = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1;
- const int z_end = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1;
+ const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
+ const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
+ const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
- // First run using the default LWS
+ std::vector<unsigned int> lws_x;
+ std::vector<unsigned int> lws_y;
+ std::vector<unsigned int> lws_z;
+
+ // Initialize the LWS values to test
+ initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
+ initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
+ initialize_lws_values(lws_z, gws[2], lws_z_max, false);
+
+ for(const auto &z : lws_z)
{
- cl::NDRange lws_test = cl::NullRange;
-
- kernel.set_lws_hint(lws_test);
-
- // Run the kernel
- kernel.run(kernel.window(), _queue_profiler);
-
- _queue_profiler.finish();
-
- const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
- const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
- const cl_ulong diff = end - start;
- _kernel_event = nullptr;
-
- min_exec_time = diff;
- }
-
- for(int z = 1; z <= z_end; ++z)
- {
- for(int y = 1; y <= y_end; ++y)
+ for(const auto &y : lws_y)
{
- for(int x = 1; x <= x_end; ++x)
+ for(const auto &x : lws_x)
{
cl::NDRange lws_test = cl::NDRange(x, y, z);
- const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1);
+ bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+ invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
if(invalid_lws)
{
@@ -192,9 +221,9 @@
kernel.set_lws_hint(lws_test);
// Run the kernel
- kernel.run(kernel.window(), _queue_profiler);
+ kernel.run(kernel.window(), queue_profiler);
- _queue_profiler.finish();
+ queue_profiler.finish();
const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
@@ -278,3 +307,4 @@
}
fs.close();
}
+} // namespace arm_compute
\ No newline at end of file
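A minimal sketch of enabling the reworked tuner for kernels that have no stored LWS entry yet; the LWS candidates tested per dimension are the powers of two plus the values whose modulo against the GWS satisfies the condition implemented in `initialize_lws_values()` above.

```cpp
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"

void tuner_example()
{
    arm_compute::CLTuner tuner(/* tune_new_kernels */ true);
    arm_compute::CLScheduler::get().default_init(&tuner);
    // Kernels enqueued from here on are timed over the generated LWS candidates
    // and the best configuration found is cached by the tuner.
}
```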
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
new file mode 100644
index 0000000..a6393c5
--- /dev/null
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel>();
+ k->configure(input, output, axis, op);
+ _kernel = std::move(k);
+}
+
+Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ return CLReductionOperationKernel::validate(input, output, axis, op);
+}
+} // namespace arm_compute
\ No newline at end of file
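A usage sketch for the new function: computing the index of the maximum along axis 0. `input` and `output` are hypothetical, pre-allocated CL tensors, and the function is assumed to expose the usual `ICLSimpleFunction::run()` entry point.

```cpp
#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
#include "arm_compute/core/Types.h"

void argmax_example(arm_compute::ICLTensor &input, arm_compute::ICLTensor &output)
{
    arm_compute::CLArgMinMaxLayer argmax;
    argmax.configure(&input, /* axis */ 0, &output, arm_compute::ReductionOperation::ARG_IDX_MAX);
    argmax.run();
}
```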
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
deleted file mode 100644
index 0b05058..0000000
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
- k->configure(input1, input2, output, policy);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- return CLArithmeticAdditionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticDivision.cpp b/src/runtime/CL/functions/CLArithmeticDivision.cpp
deleted file mode 100644
index 1c2849c..0000000
--- a/src/runtime/CL/functions/CLArithmeticDivision.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticDivision.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticDivisionKernel>();
- k->configure(input1, input2, output);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
-{
- return CLArithmeticDivisionKernel::validate(input1, input2, output);
-}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
deleted file mode 100644
index e661f6a..0000000
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
- k->configure(input1, input2, output, policy);
- _kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
-
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- return CLArithmeticSubtractionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 0000000..e0ffcdb
--- /dev/null
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
+ k->configure(input, output, policy, 0);
+ _kernel = std::move(k);
+}
+
+Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLDepthConvertLayerKernel::validate(input, output, policy, 0);
+}
+} // namespace arm_compute
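A usage sketch for CLCast, which is a thin wrapper over `CLDepthConvertLayerKernel` with the shift fixed to 0; `input` and `output` are hypothetical, pre-configured CL tensors of the source and destination data types.

```cpp
#include "arm_compute/runtime/CL/functions/CLCast.h"
#include "arm_compute/core/Types.h"

void cast_example(arm_compute::ICLTensor &input, arm_compute::ICLTensor &output)
{
    arm_compute::CLCast cast;
    cast.configure(&input, &output, arm_compute::ConvertPolicy::SATURATE);
    cast.run();
}
```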
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
new file mode 100644
index 0000000..86c9c31
--- /dev/null
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparison.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+ k->configure(input1, input2, output, operation);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+{
+ return CLComparisonKernel::validate(input1, input2, output, operation);
+}
+
+template <ComparisonOperation COP>
+void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonKernel>();
+ k->configure(input1, input2, output, COP);
+ _kernel = std::move(k);
+
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+template <ComparisonOperation COP>
+Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLComparisonKernel::validate(input1, input2, output, COP);
+}
+
+// Supported Specializations
+template class CLComparisonStatic<ComparisonOperation::Equal>;
+template class CLComparisonStatic<ComparisonOperation::NotEqual>;
+template class CLComparisonStatic<ComparisonOperation::Greater>;
+template class CLComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CLComparisonStatic<ComparisonOperation::Less>;
+template class CLComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
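A sketch of an element-wise "greater" comparison through one of the static specialisations instantiated above; `input1`, `input2` and `output` are hypothetical CL tensors, and x-dimension broadcasting is handled by the border handler configured in `CLComparisonStatic<COP>::configure()`.

```cpp
#include "arm_compute/runtime/CL/functions/CLComparison.h"
#include "arm_compute/core/Types.h"

void comparison_example(arm_compute::ICLTensor &input1, arm_compute::ICLTensor &input2, arm_compute::ICLTensor &output)
{
    arm_compute::CLComparisonStatic<arm_compute::ComparisonOperation::Greater> greater;
    greater.configure(&input1, &input2, &output);
    greater.run();
}
```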
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 409d3c9..24c152f 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index e07feb2..9da02c1 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -158,6 +158,18 @@
_scaled_output.allocator()->allocate();
}
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ configure(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
+{
+ return CLDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0, weights_info);
+}
+
void CLDeconvolutionLayer::run()
{
prepare();
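A sketch of the new convenience overload, which simply forwards zero for both inner-border arguments; the tensors below are hypothetical and assumed to be configured elsewhere.

```cpp
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
#include "arm_compute/core/Types.h"

void deconvolution_example(arm_compute::ICLTensor &input, arm_compute::ICLTensor &weights,
                           arm_compute::ICLTensor &bias, arm_compute::ICLTensor &output)
{
    arm_compute::CLDeconvolutionLayer deconv(nullptr); // no memory manager
    // Equivalent to configure(input, weights, bias, output, info, 0, 0, weights_info)
    deconv.configure(&input, &weights, &bias, &output,
                     arm_compute::PadStrideInfo(1, 1, 1, 1), arm_compute::WeightsInfo());
    deconv.run();
}
```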
diff --git a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index b5e8fd9..e46647a 100644
--- a/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,7 @@
for(unsigned int i = 0; i < _num_inputs; i++)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index 2e52e8a..dbf71ac 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -28,8 +28,8 @@
#include <utility>
-using namespace arm_compute;
-
+namespace arm_compute
+{
void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
{
auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
@@ -41,3 +41,4 @@
{
return CLDepthConvertLayerKernel::validate(input, output, policy, shift);
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 497cdae..15cbfce 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,18 +26,21 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
-CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
- : _kernel(nullptr), _border_handler()
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _kernel(nullptr), _border_handler(), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), _reshape_weights(), _permuted_input(),
+ _permuted_weights(), _permuted_output(), _original_weights(nullptr), _needs_permute(false), _needs_weights_reshape(false), _is_prepared(false)
{
}
@@ -47,25 +50,79 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- if(input->info()->data_layout() == DataLayout::NCHW)
+ const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+ _needs_permute = is_nhwc && (depth_multiplier > 1);
+ _needs_weights_reshape = is_nhwc && (depth_multiplier == 1)
+ && is_data_type_quantized_asymmetric(input->info()->data_type());
+ _is_prepared = false;
+ _original_weights = weights;
+
+ ICLTensor *input_to_use = input;
+ const ICLTensor *weights_to_use = weights;
+ ICLTensor *output_to_use = output;
+
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ info.transpose = is_stride_1 && is_dot8_supported;
+
+ if(_needs_permute)
{
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input_to_nchw.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights_to_nchw.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+ input_to_use = &_permuted_input;
+ weights_to_use = &_permuted_weights;
+ output_to_use = &_permuted_output;
+
_kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
}
+ else if(is_nhwc)
+ {
+ if(_needs_weights_reshape)
+ {
+ _reshape_weights.configure(weights, &_permuted_weights, info);
+ weights_to_use = &_permuted_weights;
+ }
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ }
else
{
- _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+ _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
}
+ // Configure kernel
_kernel->set_target(CLScheduler::get().target());
- _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ _kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, act_info);
+ // Permute output if needed
+ if(_needs_permute)
+ {
+ // Configure the function to transform the convolved output back to the original NHWC layout
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permute_output_to_nhwc.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
// Configure border handler
PixelValue &&zero_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()))
{
zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
}
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+ _border_handler.configure(input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
}
Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -75,23 +132,99 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- if(input->data_layout() == DataLayout::NCHW)
+ const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
+ const bool needs_permute = is_nhwc && (depth_multiplier > 1);
+ const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1);
+ const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+ const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+ DepthwiseConvolutionReshapeInfo info;
+ info.c0 = 4;
+ info.transpose = is_stride_1 && is_dot8_supported;
+
+ if(needs_permute)
{
- return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+
+ permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_output_shape, PermutationVector(1U, 2U, 0U));
+
+ const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW);
+ const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, conv_info, depth_multiplier, act_info, gpu_target));
+ }
+ else if(is_nhwc)
+ {
+ if(needs_weights_reshape)
+ {
+ auto reshaped_weights_shape = arm_compute::misc::shape_calculator::compute_reshaped_depthwise_weights_shape(*weights, info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier,
+ act_info));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target));
}
- return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ return Status{};
}
void CLDepthwiseConvolutionLayer3x3::run()
{
+ prepare();
+
+ _memory_group.acquire();
+
+ if(_needs_permute)
+ {
+ _permute_input_to_nchw.run();
+ }
CLScheduler::get().enqueue(_border_handler);
CLScheduler::get().enqueue(*_kernel);
+
+ if(_needs_permute)
+ {
+ _permute_output_to_nhwc.run();
+ }
+
+ _memory_group.release();
+}
+
+void CLDepthwiseConvolutionLayer3x3::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_needs_permute)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ _permuted_weights.allocator()->allocate();
+ _permute_weights_to_nchw.run();
+ _original_weights->mark_as_unused();
+ }
+
+ if(_needs_weights_reshape)
+ {
+ ARM_COMPUTE_ERROR_ON(_needs_permute);
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _permuted_weights.allocator()->allocate();
+ CLScheduler::get().enqueue(_reshape_weights);
+ _original_weights->mark_as_unused();
+ }
+ _is_prepared = true;
+ }
}
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
- _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+ _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr),
+ _optimised_function(nullptr)
{
}
@@ -104,98 +237,110 @@
const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t weights_w = weights->info()->dimension(idx_w);
- const size_t weights_h = weights->info()->dimension(idx_h);
- const size_t weights_z = weights->info()->dimension(idx_c);
+ const bool can_run_optimised_3x3_kernel = (weights->info()->dimension(idx_w) == 3) && (weights->info()->dimension(idx_h) == 3);
- _is_prepared = false;
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
-
- bool append_bias = (biases != nullptr) && !_is_quantized;
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Calculate output shape
- TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- // Output width and height
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
-
- // Set up intermediate tensors
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- // Im2Col configuration
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.set_target(gpu_target);
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
- CLScheduler::get().tune_kernel_static(_im2col_kernel);
-
- // Weights reshape configuration
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
-
- // GEMV configuration
- DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
- TensorShape shape_v2mm_out = input->info()->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- _v2mm_kernel.set_target(gpu_target);
- _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
- CLScheduler::get().tune_kernel_static(_v2mm_kernel);
- _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
-
- // Output staged configuration
- if(_is_quantized)
+ if(can_run_optimised_3x3_kernel)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
-
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
- _output_reshaped.allocator()->allocate();
+ auto f = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3>();
+ f->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
+ _optimised_function = std::move(f);
}
-
- // Fill borders on inputs
- PixelValue zero_in(static_cast<int32_t>(0));
- PixelValue zero_w(static_cast<int32_t>(0));
- if(_is_quantized)
+ else
{
- zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
- zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
- }
- BorderSize border_size = _v2mm_kernel.border_size();
- _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+ const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- border_size.bottom = 0;
- _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+ const size_t weights_w = weights->info()->dimension(idx_w);
+ const size_t weights_h = weights->info()->dimension(idx_h);
+ const size_t weights_z = weights->info()->dimension(idx_c);
- // Allocate intermediate tensors
- _input_reshaped.allocator()->allocate();
- _v2mm_output.allocator()->allocate();
+ _is_prepared = false;
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
+ bool append_bias = (biases != nullptr) && !_is_quantized;
+ const GPUTarget gpu_target = CLScheduler::get().target();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
+ // Calculate output shape
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ // Output width and height
+ const unsigned int conv_w = output_shape[idx_w];
+ const unsigned int conv_h = output_shape[idx_h];
+
+ // Set up intermediate tensors
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ // Im2Col configuration
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ _im2col_kernel.set_target(gpu_target);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ CLScheduler::get().tune_kernel_static(_im2col_kernel);
+
+ // Weights reshape configuration
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+
+ // GEMV configuration
+ DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+ TensorShape shape_v2mm_out = input->info()->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ _v2mm_kernel.set_target(gpu_target);
+ _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+ CLScheduler::get().tune_kernel_static(_v2mm_kernel);
+ _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+ // Output staged configuration
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_reshaped.allocator()->allocate();
+ }
+
+ // Fill borders on inputs
+ PixelValue zero_in(static_cast<int32_t>(0));
+ PixelValue zero_w(static_cast<int32_t>(0));
+ if(_is_quantized)
+ {
+ zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+ zero_w = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+ }
+ BorderSize border_size = _v2mm_kernel.border_size();
+ _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+ border_size.bottom = 0;
+ _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
+
+ // Allocate intermediate tensors
+ _input_reshaped.allocator()->allocate();
+ _v2mm_output.allocator()->allocate();
+
+ //Configure Activation Layer
+ _is_activationlayer_enabled = act_info.enabled();
+
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.configure(output, nullptr, act_info);
+ }
}
}
@@ -204,55 +349,64 @@
{
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+ const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3);
- const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const bool append_bias = (biases != nullptr) && !is_quantized;
- const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
- const size_t weights_w = weights->dimension(idx_w);
- const size_t weights_h = weights->dimension(idx_h);
- const size_t weights_z = weights->dimension(idx_c);
- const unsigned int conv_w = output_shape[idx_w];
- const unsigned int conv_h = output_shape[idx_h];
- const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
- const size_t conv_size = conv_w * conv_h;
-
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, patch_size);
- shape_im2col.set(1, conv_size);
- shape_im2col.set(2, weights_z);
- TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
-
- const TensorShape shape_weights_reshape(patch_size, weights_z);
- TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
-
- DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
- TensorShape shape_v2mm_out = input->tensor_shape();
- shape_v2mm_out.set(0, conv_size * weights_z);
- shape_v2mm_out.set(1, 1);
- shape_v2mm_out.set(2, 1);
- TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
-
- TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
-
- if(is_quantized)
+ if(can_run_optimised_3x3_kernel)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
- }
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- // Validate Activation Layer
- if(act_info.enabled())
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(idx_c) * depth_multiplier) != weights->dimension(idx_c));
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const bool append_bias = (biases != nullptr) && !is_quantized;
+ const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const size_t weights_w = weights->dimension(idx_w);
+ const size_t weights_h = weights->dimension(idx_h);
+ const size_t weights_z = weights->dimension(idx_c);
+ const unsigned int conv_w = output_shape[idx_w];
+ const unsigned int conv_h = output_shape[idx_h];
+ const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerReshapeWeightsGenericKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+ DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+ TensorShape shape_v2mm_out = input->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+ TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+ }
+
+ // Validate Activation Layer
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ }
+ }
+ else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+ CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
}
-
return Status{};
}
@@ -260,33 +414,48 @@
{
prepare();
- CLScheduler::get().enqueue(_im2col_kernel);
- CLScheduler::get().enqueue(_v2mm_input_fill_border);
- CLScheduler::get().enqueue(_v2mm_kernel);
- CLScheduler::get().enqueue(_vector_to_tensor_kernel);
- if(_is_quantized)
+ if(_optimised_function != nullptr)
{
- CLScheduler::get().enqueue(_output_stage_kernel);
+ _optimised_function->run();
}
- if(_is_activationlayer_enabled)
+ else
{
- _activationlayer_function.run();
+ CLScheduler::get().enqueue(_im2col_kernel);
+ CLScheduler::get().enqueue(_v2mm_input_fill_border);
+ CLScheduler::get().enqueue(_v2mm_kernel);
+ CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+ if(_is_quantized)
+ {
+ CLScheduler::get().enqueue(_output_stage_kernel);
+ }
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function.run();
+ }
}
}
void CLDepthwiseConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if(_optimised_function != nullptr)
{
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ _optimised_function->prepare();
+ }
+ else
+ {
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- CLScheduler::get().enqueue(_weights_reshape_kernel);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
- _original_weights->mark_as_unused();
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ _original_weights->mark_as_unused();
- CLScheduler::get().queue().finish();
- _is_prepared = true;
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
}
}
+} // namespace arm_compute
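
For reference, the generic (non-3x3) depthwise path above is mostly shape bookkeeping: each depth slice is unrolled into kw*kh patches (plus one element when the bias is appended), multiplied against the reshaped weights with a matrix-vector product, and written back as a tensor. A standalone sketch of those shapes, using made-up example sizes:

#include <cstddef>
#include <cstdio>

int main()
{
    const size_t kw = 5, kh = 5;         // kernel size on the generic path (i.e. not 3x3)
    const size_t out_w = 14, out_h = 14; // output spatial size from the conv descriptor
    const size_t channels = 32;          // weights_z
    const bool   append_bias = true;

    const size_t patch_size = kw * kh + (append_bias ? 1 : 0); // dim 0 of the im2col output
    const size_t conv_size  = out_w * out_h;                   // dim 1 of the im2col output

    // _input_reshaped   : patch_size x conv_size x channels
    // _weights_reshaped : patch_size x channels
    // _v2mm_output      : (conv_size * channels) x 1 x 1, reshaped back to out_w x out_h x channels
    std::printf("im2col   : %zu x %zu x %zu\n", patch_size, conv_size, channels);
    std::printf("weights  : %zu x %zu\n", patch_size, channels);
    std::printf("gemv out : %zu x 1 x 1\n", conv_size * channels);
    return 0;
}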
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
new file mode 100644
index 0000000..b7e9a68
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::RSQRT);
+ _kernel = std::move(k);
+}
+Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+}
+
+void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(input, output, ElementWiseUnary::EXP);
+ _kernel = std::move(k);
+}
+Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+}
+} // namespace arm_compute
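
A hedged usage sketch for the new element-wise unary functions follows; it assumes a working OpenCL context and the library's usual CLTensor setup, and the shapes and F32 data type are arbitrary examples rather than requirements.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create a default context and queue

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    CLRsqrtLayer rsqrt;
    rsqrt.configure(&src, &dst); // element-wise 1/sqrt(x)

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... map src, fill it with data, unmap ...
    rsqrt.run();
    CLScheduler::get().sync();
    return 0;
}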
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
new file mode 100644
index 0000000..28f4b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "support/ToolchainSupport.h"
+#include <arm_compute/runtime/CL/functions/CLElementwiseOperations.h>
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
+void configure_border_handler(CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+{
+ if(output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if(broadcasted_info->info()->dimension(0) == 1)
+ {
+ border_handler.configure(broadcasted_info, border_size, BorderMode::REPLICATE);
+ }
+ }
+}
+} // namespace
+
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::ADD, input1, input2, output, policy);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy);
+}
+
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SUB, input1, input2, output, policy);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy);
+}
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::DIV, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output);
+}
+
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MAX, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MIN, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+ _kernel = std::move(k);
+ configure_border_handler(_border_handler, _kernel->border_size(), input1, input2, output);
+}
+
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+} // namespace arm_compute
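
A similar hedged usage sketch for the binary element-wise functions: the second input is broadcast along dimension 0, which is the case configure_border_handler above covers with a REPLICATE border. Shapes and data type are illustrative assumptions, and an initialised CL scheduler is required as in the previous sketch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(1U, 8U), 1, DataType::F32)); // broadcast along dimension 0
    out.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));

    CLArithmeticAddition add;
    add.configure(&a, &b, &out, ConvertPolicy::SATURATE);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    // ... fill a and b ...
    add.run();
    CLScheduler::get().sync();
    return 0;
}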
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index baa0cf4..e91038f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,32 +33,42 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
#include "arm_compute/runtime/ITensorAllocator.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
namespace
{
-inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
bool flag = true;
if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
{
- // COMPMID-852
- if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ if((m > 1) && n < 16)
{
- constexpr float alpha = 3.2f;
- constexpr float fact0 = 1.51f;
- constexpr float fact1 = 1.66f;
- constexpr float ops = 12.0f;
- const float scale = k > 1024 ? 1.07f : 1.0f;
- flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+ flag = true;
}
else
{
- flag = false;
+ // COMPMID-852
+ if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+ {
+ constexpr float alpha = 3.2f;
+ constexpr float fact0 = 1.51f;
+ constexpr float fact1 = 1.66f;
+ constexpr float ops = 12.0f;
+ const float scale = k > 1024 ? 1.07f : 1.0f;
+ flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+ }
+ else
+ {
+ flag = false;
+ }
}
}
else
@@ -73,17 +83,19 @@
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
- _interleave_kernel(),
- _transpose_kernel(),
_mm_kernel(),
_ma_kernel(),
+ _reshape_lhs_kernel(),
+ _reshape_rhs_kernel(),
+ _mm_reshaped_kernel(),
_tmp_a(),
_tmp_b(),
_original_b(nullptr),
_is_interleaved_transposed(false),
_run_addition(false),
_reshape_b_only_on_first_run(false),
- _is_prepared(false)
+ _is_prepared(false),
+ _is_new_gemm_reshaped(false)
{
}
@@ -106,29 +118,52 @@
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _interleave_kernel.set_target(gpu_target);
+ _reshape_lhs_kernel.set_target(gpu_target);
_mm_kernel.set_target(gpu_target);
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
- const int n = b->info()->dimension(0);
- const int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
+ DataType data_type = a->info()->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
mult_transpose1xW_width = 4;
mult_interleave4x4_height = 2;
}
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->info()->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
// Check if we need to reshape the matrix A and matrix B
_is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+ // Check if we can run the new reshaped GEMM
+ const auto workload = static_cast<float>((m * n) / 20.0f);
+ _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
+
+ const bool add_matrix_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(_is_interleaved_transposed)
{
@@ -145,19 +180,37 @@
}
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d());
+ if(_is_new_gemm_reshaped)
+ {
+ GEMMLHSMatrixInfo lhs_info;
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+ depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure interleave kernel
+ _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ // Configure transpose kernel
+ _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+ }
}
- // Configure and tune matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d),
- gemm_info.fp_mixed_precision());
- CLScheduler::get().tune_kernel_static(_mm_kernel);
+ if(!_is_new_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
+ GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
+ gemm_info.fp_mixed_precision());
+ CLScheduler::get().tune_kernel_static(_mm_kernel);
+ }
if(_is_interleaved_transposed)
{
@@ -170,7 +223,7 @@
}
// Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
+ if(add_matrix_c && !use_fused_add)
{
_ma_kernel.configure(c, output, beta);
_run_addition = true;
@@ -197,13 +250,15 @@
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
{
@@ -211,9 +266,31 @@
mult_interleave4x4_height = 2;
}
+ GEMMRHSMatrixInfo rhs_info;
+ rhs_info.n0 = 16 / b->element_size();
+ rhs_info.k0 = 1;
+ rhs_info.h0 = mult_transpose1xW_width;
+ rhs_info.interleave = false;
+ rhs_info.transpose = false;
+
+ GEMMLHSMatrixInfo lhs_info;
+ lhs_info.m0 = 4;
+ lhs_info.k0 = 4;
+ lhs_info.v0 = mult_interleave4x4_height;
+ lhs_info.interleave = true;
+ lhs_info.transpose = true;
+
// Check if we need to reshape the matrix A and matrix B
const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+ // Check if we can run the new reshaped GEMM
+ const auto workload = static_cast<float>((m * n) / 20.0f);
+ const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+
+ const bool add_matrix_c = (beta != 0.f && c != nullptr);
+ const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+ const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(run_interleave_transpose)
{
@@ -227,19 +304,42 @@
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+ if(is_new_gemm_reshaped)
+ {
+ GEMMLHSMatrixInfo lhs_info;
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
+ depth_output_gemm3d, reinterpret_input_as_3d)));
+ }
+ else
+ {
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+ }
}
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+ if(!is_new_gemm_reshaped)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
+ run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+ }
- if(beta != 0 && c != nullptr)
+ if(add_matrix_c && !use_fused_add)
{
// Validate matrix addition kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
@@ -257,17 +357,24 @@
if(_is_interleaved_transposed)
{
// Run interleave kernel
- CLScheduler::get().enqueue(_interleave_kernel, false);
+ CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- CLScheduler::get().enqueue(_transpose_kernel, false);
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
}
}
// Run matrix multiply kernel
- CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ if(_is_new_gemm_reshaped)
+ {
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ }
// Run matrix addition kernel
if(_run_addition)
@@ -286,10 +393,11 @@
{
// Run transpose kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
- CLScheduler::get().enqueue(_transpose_kernel, false);
+ CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
_original_b->mark_as_unused();
}
CLScheduler::get().queue().finish();
_is_prepared = true;
}
}
+} // namespace arm_compute
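
The kernel selection in CLGEMM::configure()/validate() above boils down to two checks. A standalone sketch with the constants copied from the diff is shown below; the GPU-target and data-type conditions are left as comments, and the numbers should be read as tuning constants rather than a specification.

#include <cstdio>

// Bifrost-like branch of the reshape heuristic (float data types assumed).
static bool worth_reshaping(unsigned int m, unsigned int n, unsigned int k, bool reshape_b_only_on_first_run)
{
    if(m > 1 && n < 16)
    {
        return true; // narrow RHS: always reshape
    }
    if(k > 256 && m > 4 && reshape_b_only_on_first_run) // COMPMID-852 heuristic
    {
        constexpr float alpha = 3.2f, fact0 = 1.51f, fact1 = 1.66f, ops = 12.0f;
        const float     scale = k > 1024 ? 1.07f : 1.0f;
        return alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
    }
    return false;
}

int main()
{
    const unsigned int m = 256, n = 256, k = 2048;
    const bool  reshape  = worth_reshaping(m, n, k, true);
    const float workload = (m * n) / 20.0f;
    const bool  new_path = reshape && (workload > 1600.0f); // plus the Bifrost and F32 checks in the diff
    std::printf("reshape=%d new_reshaped_gemm=%d (workload=%.1f)\n", reshape, new_path, workload);
    return 0;
}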
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 4694aa7..7105e85 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,7 +93,7 @@
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
_original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
- _is_activationlayer_enabled(false), _is_prepared(false)
+ _is_activationlayer_enabled(false), _is_prepared(false), _run_addition(true)
{
}
@@ -101,7 +101,8 @@
int gemm_3d_depth)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col,
+ _run_addition));
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
@@ -125,13 +126,15 @@
}
else
{
+ // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+ const bool skip_bias_in_gemm = _run_addition || !_skip_im2col;
// Configure matrix multiply function
- _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ _mm_gemm.configure(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
}
}
Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, bool run_addition)
{
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
@@ -156,8 +159,10 @@
}
else
{
+ // Bias does not need to be added in GEMM if im2col is being used or the Matrix Addition kernel needs to be run
+ const bool skip_bias_in_gemm = run_addition || !skip_im2col;
// Perform validation step on Matrix multiply function
- return CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+ return CLGEMM::validate(input, weights, (skip_bias_in_gemm) ? nullptr : biases, output, 1.0f, 1.0f, gemm_info);
}
}
@@ -193,6 +198,8 @@
_skip_col2im = data_layout == DataLayout::NHWC;
_append_bias = (biases != nullptr) && (!_is_quantized);
_is_activationlayer_enabled = act_info.enabled();
+ // In case of F16, fused bias will be used in GEMM
+ _run_addition = (_skip_im2col) && (_append_bias) && (data_type != DataType::F16);
// Set the GPU target for im2col and col2im
_im2col_kernel.set_target(CLScheduler::get().target());
@@ -242,7 +249,7 @@
else if(_append_bias)
{
// Configure add bias kernel
- _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
+ _add_bias_kernel.configure(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE);
}
// Create GEMM output tensor
@@ -276,9 +283,9 @@
{
const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
- const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
+ const float multiplier = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
int min_activation = 0;
@@ -375,6 +382,8 @@
const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
const bool skip_col2im = data_layout == DataLayout::NHWC;
bool is_activationlayer_enabled = act_info.enabled();
+ // In case of F16, fused bias will be used in GEMM
+ const bool run_addition = (skip_im2col) && (append_bias) && (data_type != DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -429,10 +438,10 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation, num_groups));
gemm_input_to_use = &im2col_reshaped_info;
}
- else if(append_bias)
+ else if(run_addition)
{
// Validate add bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output, biases, output, ConvertPolicy::SATURATE));
}
// Create GEMM output tensor
@@ -459,9 +468,9 @@
{
const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
- const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
- int output_multiplier = 0;
- int output_shift = 0;
+ const float multiplier = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
@@ -496,7 +505,7 @@
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, run_addition));
// Validate Col2Im
if(!skip_col2im)
@@ -537,7 +546,7 @@
_mm_gemm.run();
}
- if(_skip_im2col && _append_bias)
+ if(_run_addition)
{
CLScheduler::get().enqueue(_add_bias_kernel);
}
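
The new _run_addition flag above encodes a three-way bias policy for the convolution GEMM. A standalone sketch of the decision (the enum here is a local stand-in, not the library's DataType):

#include <cstdio>

enum class DataType { F16, F32, QASYMM8 };

// True when the separate arithmetic-addition kernel adds the bias:
// im2col is skipped (NHWC 1x1 convolution), a bias is present, and the data
// type is not F16. For F16 the bias is fused into the GEMM instead, and when
// im2col does run the bias is appended during im2col, so the GEMM gets none.
static bool run_separate_bias_add(bool skip_im2col, bool append_bias, DataType dt)
{
    return skip_im2col && append_bias && (dt != DataType::F16);
}

int main()
{
    std::printf("F32 1x1 NHWC : %d (separate add kernel)\n", run_separate_bias_add(true, true, DataType::F32));
    std::printf("F16 1x1 NHWC : %d (bias fused into GEMM)\n", run_separate_bias_add(true, true, DataType::F16));
    std::printf("F32 3x3 NCHW : %d (bias appended by im2col)\n", run_separate_bias_add(false, true, DataType::F32));
    return 0;
}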
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2d4d231..2a01db7 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,42 +31,25 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
namespace
{
-inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
- bool flag = true;
-
- if(gpu_target_is_in(gpu_target,
- GPUTarget::G71, GPUTarget::G72,
- GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
- {
- // COMPMID-852
- if(k > 256 && m > 4 && reshape_b_only_on_first_run)
- {
- flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
- }
- else
- {
- flag = false;
- }
- }
- else
- {
- flag = m > 1;
- }
-
- return flag;
+ return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (m > 1) && (reshape_b_only_on_first_run);
}
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_mm_kernel(),
+ _mm_reshaped_kernel(),
_mtx_a_reshape_kernel(),
_mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(),
@@ -81,7 +64,7 @@
_original_b(nullptr),
_a_offset(0),
_b_offset(0),
- _is_interleaved_transposed(true),
+ _is_gemm_reshaped(true),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
_fuse_output_stage(false)
@@ -108,23 +91,23 @@
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
// in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const bool unroll_block = dot8_supported(CLKernelLibrary::get().get_device());
- const int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
- const int n = b->info()->dimension(0);
- const int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- constexpr int mult_transpose1xW_width = 1;
- constexpr int mult_interleave4x4_height = 1;
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+ const unsigned int n = b->info()->dimension(0);
+ const unsigned int k = a->info()->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
// Check if we need to reshape the matrix A and matrix B
- _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
+ _is_gemm_reshaped = is_gemm_reshaped(m, _reshape_b_only_on_first_run, gpu_target);
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
// if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
reinterpret_input_as_3d = false;
@@ -138,11 +121,14 @@
_memory_group.manage(&_tmp_b);
}
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
// Configure interleave kernel
- _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
+ _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
// Configure transpose kernel
- _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
+ _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
}
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -177,10 +163,16 @@
_memory_group.manage(&_mm_result_s32);
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
// Configure offset contribution kernel
_offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
@@ -190,17 +182,23 @@
}
else
{
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
- mult_transpose1xW_width, mult_interleave4x4_height,
- depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_gemm_reshaped)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
// Configure offset contribution kernel
_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
}
// Allocate tensors
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
_tmp_a.allocator()->allocate();
if(!_reshape_b_only_on_first_run)
@@ -233,18 +231,19 @@
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- constexpr int mult_transpose1xW_width = 1;
- constexpr int mult_interleave4x4_height = 1;
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+ bool reshape_matrices = is_gemm_reshaped(m, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
// if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
if(reshape_matrices)
@@ -252,20 +251,24 @@
reinterpret_input_as_3d = false;
}
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
if(reshape_matrices)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -292,12 +295,22 @@
{
TensorInfo mm_result_s32_info{};
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+ if(reshape_matrices)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
+ else
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+ }
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -309,9 +322,16 @@
}
else
{
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
-
+ if(reshape_matrices)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ }
+ else
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+ }
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -329,7 +349,7 @@
_memory_group.acquire();
- if(_is_interleaved_transposed)
+ if(_is_gemm_reshaped)
{
// Run reshape matrix A
CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
@@ -348,7 +368,14 @@
}
// Run matrix multiply
- CLScheduler::get().enqueue(_mm_kernel, false);
+ if(_is_gemm_reshaped)
+ {
+ CLScheduler::get().enqueue(_mm_reshaped_kernel, false);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_kernel, false);
+ }
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
@@ -374,7 +401,7 @@
{
if(!_is_prepared)
{
- if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
{
ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
@@ -395,3 +422,4 @@
_is_prepared = true;
}
}
+} // namespace arm_compute
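
The matrix A/B reduction kernels above exist because of the standard low-precision GEMM identity: expanding sum_k (a_ik - za)(b_kj - zb) shows that the column sums of B are only needed when the A offset is non-zero and the row sums of A only when the B offset is non-zero, which is exactly how the kernels are gated. A standalone numerical check of that algebra follows (plain zero points za/zb are used here; the library's _a_offset/_b_offset carry their own sign convention).

#include <cstdint>
#include <cstdio>

int main()
{
    const int     K    = 3;
    const int32_t a[K] = { 12, 200, 37 };  // one row of A (uint8 values)
    const int32_t b[K] = { 90, 15, 101 };  // one column of B
    const int32_t za = 5, zb = 7;          // zero points

    int32_t direct = 0, raw = 0, row_sum_a = 0, col_sum_b = 0;
    for(int k = 0; k < K; ++k)
    {
        direct    += (a[k] - za) * (b[k] - zb); // reference result
        raw       += a[k] * b[k];               // what the int8 GEMM computes
        row_sum_a += a[k];
        col_sum_b += b[k];
    }
    // Offset contribution: raw - zb*row_sum(A) - za*col_sum(B) + K*za*zb
    const int32_t with_offsets = raw - zb * row_sum_a - za * col_sum_b + K * za * zb;
    std::printf("direct=%d reconstructed=%d\n", direct, with_offsets);
    return 0;
}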
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
new file mode 100644
index 0000000..459438e
--- /dev/null
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ return CLGatherKernel::validate(input, indices, output, axis);
+}
+} // namespace arm_compute
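
A hedged usage sketch for the new CLGather function: the U32 indices type and the shapes are assumptions for illustration, and an initialised CL scheduler is required as in the other sketches.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, idx, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    idx.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));    // 3 indices to pick
    dst.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32)); // axis-0 size replaced by 3

    CLGather gather;
    gather.configure(&src, &idx, &dst, 0); // gather along axis 0

    src.allocator()->allocate();
    idx.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src and idx ...
    gather.run();
    CLScheduler::get().sync();
    return 0;
}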
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 5dd1202..c50132e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,7 @@
_memset_kernel(),
_padded_copy_kernel(),
_cpp_nms_kernel(),
+ _is_nhwc(false),
_deltas_permuted(),
_deltas_flattened(),
_scores_permuted(),
@@ -60,10 +61,11 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType data_type = deltas->info()->data_type();
- const int num_anchors = scores->info()->dimension(2);
- const int feat_width = scores->info()->dimension(0);
- const int feat_height = scores->info()->dimension(1);
+ const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int pre_nms_topN = info.pre_nms_topN();
const int post_nms_topN = info.post_nms_topN();
@@ -77,21 +79,37 @@
_deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
// Permute and reshape deltas
- _memory_group.manage(&_deltas_permuted);
- _memory_group.manage(&_deltas_flattened);
- _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
- _deltas_permuted.allocator()->allocate();
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_deltas_permuted);
+ _memory_group.manage(&_deltas_flattened);
+ _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+ _deltas_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _memory_group.manage(&_deltas_flattened);
+ _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+ }
const TensorShape flatten_shape_scores(1, total_num_anchors);
_scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
// Permute and reshape scores
- _memory_group.manage(&_scores_permuted);
- _memory_group.manage(&_scores_flattened);
- _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
- _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
- _scores_permuted.allocator()->allocate();
+ if(!_is_nhwc)
+ {
+ _memory_group.manage(&_scores_permuted);
+ _memory_group.manage(&_scores_flattened);
+ _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+ _scores_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _memory_group.manage(&_scores_flattened);
+ _flatten_scores_kernel.configure(scores, &_scores_flattened);
+ }
// Bounding box transform
_memory_group.manage(&_all_proposals);
@@ -141,11 +159,12 @@
const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW, DataLayout::NHWC);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
- const int num_anchors = scores->dimension(2);
- const int feat_width = scores->dimension(0);
- const int feat_height = scores->dimension(1);
+ const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -156,14 +175,21 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+ TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if(scores->data_layout() == DataLayout::NHWC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ }
TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
-
TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -236,9 +262,12 @@
CLScheduler::get().enqueue(_compute_anchors_kernel, false);
// Transpose and reshape the inputs
- CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+ if(!_is_nhwc)
+ {
+ CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+ CLScheduler::get().enqueue(_permute_scores_kernel, false);
+ }
CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
- CLScheduler::get().enqueue(_permute_scores_kernel, false);
CLScheduler::get().enqueue(_flatten_scores_kernel, false);
// Build the boxes
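For reference, a minimal sketch of the layout-aware dimension lookup the new NHWC path relies on; the helper names come from arm_compute/core/Helpers.h and the tensor info is assumed to be already configured:

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"

// Resolve the feature-map extents independently of NCHW/NHWC ordering, as the
// configure()/validate() changes above do.
void anchor_dims(const arm_compute::ITensorInfo &scores, int &num_anchors, int &feat_width, int &feat_height)
{
    using namespace arm_compute;
    const DataLayout layout = scores.data_layout();
    num_anchors = static_cast<int>(scores.dimension(get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL)));
    feat_width  = static_cast<int>(scores.dimension(get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH)));
    feat_height = static_cast<int>(scores.dimension(get_data_layout_dimension_index(layout, DataLayoutDimension::HEIGHT)));
    // NCHW resolves CHANNEL/WIDTH/HEIGHT to indices 2/0/1, NHWC to 0/1/2.
}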
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 4f709d5..2e3c6d7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -32,8 +32,8 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
@@ -81,3 +81,4 @@
_memory_group.release();
}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a89c4e3..f01b1b8 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -110,9 +110,9 @@
_gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
_forget_gate_out2.allocator()->allocate();
_memory_group.manage(&_forget_gate_out5);
- _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _forget_gate_out1.allocator()->allocate();
CLTensor *forget_gate_out = &_forget_gate_out5;
-
if(lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,17 +129,18 @@
{
_forget_gate_out3.allocator()->allocate();
}
- _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ CLTensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -160,17 +161,23 @@
_gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
_input_gate_out2.allocator()->allocate();
_memory_group.manage(&_input_gate_out4);
- _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
+ input_gate_out = &_input_gate_out4;
if(_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out5);
_pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out4.allocator()->allocate();
_input_gate_out5.allocator()->allocate();
+ input_gate_out = &_input_gate_out1;
}
- _input_gate_out3.allocator()->allocate();
- _input_gate_out4.allocator()->allocate();
- _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ else
+ {
+ _input_gate_out1.allocator()->allocate();
+ }
+ _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -190,14 +197,13 @@
_gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
- _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _accum_cell_state1.configure(ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
_activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_cell_state_out4.allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _forget_gate_out1.allocator()->allocate();
- _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_cell_state2.configure(ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
@@ -223,7 +229,7 @@
_gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
_output2.allocator()->allocate();
_memory_group.manage(&_output5);
- _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+ _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
_output3.allocator()->allocate();
CLTensor *output_gate_out = &_output5;
if(lstm_params.has_peephole_opt())
@@ -284,13 +290,13 @@
std::vector<ICLTensor *> scratch_inputs;
if(!lstm_params.has_cifg_opt())
{
- scratch_inputs.emplace_back(&_input_gate_out1);
+ scratch_inputs.emplace_back(input_gate_out);
}
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
_concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
- _input_gate_out1.allocator()->allocate();
+ input_gate_out->allocator()->allocate();
_cell_state_out1.allocator()->allocate();
forget_gate_out->allocator()->allocate();
output_gate_out->allocator()->allocate();
@@ -364,7 +370,7 @@
// Validate forget gate
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
if(lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
@@ -396,7 +402,7 @@
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
@@ -544,4 +550,4 @@
_concat_scratch_buffer.run();
_memory_group.release();
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 7e5278f..559b57f 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,8 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
#include "support/ToolchainSupport.h"
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index 32d8f15..8489fab 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,7 +45,7 @@
_norm_kernel.configure(input, output, norm_info);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index de43c7d..3aa1b1e 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,21 +34,21 @@
{
}
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value)
{
// Copy the input to the output
_copy_kernel.configure(input, output, padding);
// Set the whole output to the constant value
- _memset_kernel.configure(output, PixelValue());
+ _memset_kernel.configure(output, constant_value);
// Fill the padding on the first two dimensions with the constant value
- _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+ _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT, constant_value);
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, constant_value));
ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
return Status{};
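A minimal usage sketch of the extended pad interface; the source/destination tensors (and their padded output shape) are hypothetical and assumed to be initialised and allocated elsewhere:

#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"

using namespace arm_compute;

void pad_with_ones(CLTensor &src, CLTensor &dst)
{
    CLPadLayer pad;
    const PaddingList padding = { { 1, 1 }, { 2, 2 } };  // (before, after) elements per dimension
    pad.configure(&src, &dst, padding, PixelValue(1.f)); // fill value forwarded to the memset/border kernels
    pad.run();
}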
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 1809e6e..63f00ac 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -60,7 +60,7 @@
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
return Status{};
@@ -90,7 +90,7 @@
_add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
_memory_group.manage(&_add_output);
- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+ _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
_fully_connected_out.allocator()->allocate();
_gemm_output.allocator()->allocate();
@@ -127,4 +127,4 @@
_is_prepared = true;
}
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 0f480ee..7bb4178 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,7 @@
using namespace arm_compute;
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
{
// Configure ROI pooling kernel
auto k = arm_compute::support::cpp14::make_unique<CLROIPoolingLayerKernel>();
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
new file mode 100644
index 0000000..b2cd472
--- /dev/null
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRange.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLRange::configure(ICLTensor *output, const float start, const float end, const float step)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLRangeKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(output, start, end, step);
+ _kernel = std::move(k);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status CLRange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+ return CLRangeKernel::validate(output, start, end, step);
+}
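A short usage sketch of the new function; the default CL context initialisation and the 10-element F32 output shape are assumptions made for the example:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRange.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();
    CLTensor out;
    out.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    CLRange range;
    range.configure(&out, 0.f, 20.f, 2.f); // fills {0, 2, 4, ..., 18}
    out.allocator()->allocate();
    range.run();
    CLScheduler::get().sync();
    return 0;
}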
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index 1016ff7..b2d0f81 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,22 +45,31 @@
_reduced_outs = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+
+ // Convert negative axis
+ for(unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
- out_shape.set(reduction_axis[i], 1);
+ out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
if(i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
_memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -77,11 +86,10 @@
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
- Coordinates axis_copy = reduction_axis;
- std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_copy[i] - i);
+ out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
@@ -90,22 +98,43 @@
Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
- ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+ TensorShape out_shape = input->tensor_shape();
+
+ Coordinates axis_sorted = reduction_axis;
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+
+ // Convert negative axis
+ for(unsigned int i = 0; i < reduction_ops; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+ axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
+ }
+
+ std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
if(output->total_size() > 0 && keep_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
}
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+ if(keep_dims)
+ {
+ out_shape.set(axis_sorted[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_sorted[i] - i);
+ }
}
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
return Status{};
}
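A small sketch of the axis normalisation introduced above, assuming the wrap_around helper lives in arm_compute/core/Helpers.h:

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"

using namespace arm_compute;

// Negative reduction axes are wrapped into [0, rank) before any reduction runs.
Coordinates normalise_axes(Coordinates axes, int input_dims)
{
    for(unsigned int i = 0; i < axes.num_dimensions(); ++i)
    {
        axes[i] = wrap_around(axes[i], input_dims); // e.g. axis -1 on a 4-D tensor becomes 3
    }
    return axes;
}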
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ff..3d82e3f 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,15 +56,19 @@
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+ : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
{
}
Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
- if(axis == 0 && !is_data_type_quantized(input->data_type()))
+ bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
+ if(is_serial)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+ }
+ else
{
// Create temporary tensor infos
auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@
}
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
break;
default:
ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@
// Validate ReductionOperation on intermediate stages
for(unsigned int i = 1; i < num_of_stages - 1; ++i)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
}
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
}
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
- }
return Status{};
}
@@ -122,65 +130,77 @@
{
_num_of_stages = calculate_number_of_stages(input->info(), axis);
_reduction_axis = axis;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
+ _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
// Configure reduction operation kernels
_reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
// Create temporary tensors
- if(axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ }
+ else
{
_border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
TensorShape shape{ input->info()->tensor_shape() };
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
}
// Apply ReductionOperation only on first kernel
- _memory_group.manage(_sums_vector.get());
+ _memory_group.manage(_results_vector.get());
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
+ PixelValue pixelValue;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ pixelValue = PixelValue();
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ pixelValue = PixelValue();
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
+ pixelValue = PixelValue(1, input->info()->data_type());
break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
- _memory_group.manage(_sums_vector.get() + i);
- _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
- _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[i - 1].allocator()->allocate();
+ _memory_group.manage(_results_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[last_stage - 1].allocator()->allocate();
- }
- else
- {
- _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[last_stage - 1].allocator()->allocate();
}
}
@@ -188,7 +208,11 @@
{
_memory_group.acquire();
- if(_reduction_axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+ }
+ else
{
for(unsigned int i = 0; i < _num_of_stages; ++i)
{
@@ -196,10 +220,6 @@
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
- else
- {
- CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
- }
_memory_group.release();
}
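A usage sketch for the newly supported PROD reduction; the tensors are hypothetical and assumed to be initialised as (256, 16) and (1, 16) F32 and already allocated:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

using namespace arm_compute;

void product_along_x(CLTensor &src, CLTensor &dst)
{
    CLReductionOperation reduce;
    reduce.configure(&src, &dst, 0 /* axis */, ReductionOperation::PROD);
    // Axis 0 on a non-quantized type takes the multi-stage path above; the
    // intermediate borders are filled with 1, the identity element for PROD.
    reduce.run();
}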
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
new file mode 100644
index 0000000..0f86b9f
--- /dev/null
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+
+#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLReverseKernel>();
+ k->configure(input, output, axis);
+ _kernel = std::move(k);
+}
+
+Status CLReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ return CLReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
new file mode 100644
index 0000000..90c368e
--- /dev/null
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSelect.h"
+
+#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSelectKernel>();
+ k->configure(c, x, y, output);
+ _kernel = std::move(k);
+}
+
+Status CLSelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ return CLSelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index bef7eca..f630853 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -36,10 +36,10 @@
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Build the slice end mask from the end coordinates
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+ k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
_kernel = std::move(k);
}
@@ -54,8 +54,8 @@
}));
// Build the slice end mask from the end coordinates
- const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
- return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+ return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
}
} // namespace arm_compute
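A short sketch of slicing with negative end coordinates, which the end mask now encodes directly; the tensors are hypothetical and assumed to be initialised and allocated:

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"

using namespace arm_compute;

void slice_tail(CLTensor &src, CLTensor &dst)
{
    CLSlice slice;
    const Coordinates starts(0, 1);
    const Coordinates ends(-1, 3); // -1: the end mask marks dimension 0 as "slice up to the end"
    slice.configure(&src, &dst, starts, ends);
    slice.run();
}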
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 76c1e18..a24b72e 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -33,20 +33,19 @@
namespace arm_compute
{
CLSpaceToBatchLayer::CLSpaceToBatchLayer()
- : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+ : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
{
}
void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
}
-
- _output = output;
_space_to_batch_kernel.configure(input, block_shape, paddings, output);
}
@@ -57,42 +56,35 @@
if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
+ _memset_kernel.configure(output, PixelValue());
}
-
- _output = output;
_space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
{
- return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+ return Status{};
}
Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
const ITensorInfo *output)
{
- return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+ return Status{};
}
void CLSpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- // TODO(micspy01): replace with memset once ready
if(_has_padding)
{
- _output->map(CLScheduler::get().queue(), true);
- if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
+ CLScheduler::get().enqueue(_memset_kernel, true);
}
-
CLScheduler::get().enqueue(_space_to_batch_kernel, true);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
new file mode 100644
index 0000000..71327fe
--- /dev/null
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <complex>
+
+#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLStackLayer::CLStackLayer() // NOLINT
+ : _input(),
+ _stack_kernels(),
+ _num_inputs(0)
+{
+}
+
+void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+{
+ _num_inputs = input.size();
+ _stack_kernels = arm_compute::support::cpp14::make_unique<CLStackLayerKernel[]>(_num_inputs);
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+ }
+}
+
+Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+ // Wrap around negative values
+ const size_t rank = input[0]->num_dimensions();
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+ const unsigned int num_inputs = input.size();
+
+ for(unsigned int i = 0; i < num_inputs; i++)
+ {
+ // All the tensors must have the same rank
+ ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+ // Validate Kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+ }
+
+ return Status{};
+}
+
+void CLStackLayer::run()
+{
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_stack_kernels[i], false);
+ }
+}
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
new file mode 100644
index 0000000..ec6a4ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTile.h"
+
+#include "arm_compute/core/CL/kernels/CLTileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLTileKernel>();
+ k->configure(input, output, multiples);
+ _kernel = std::move(k);
+}
+
+Status CLTile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ return CLTileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
new file mode 100644
index 0000000..428d091
--- /dev/null
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUnstack.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+ return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+ // Sets up the coordinates used to slice the input tensor: all start coordinates are zeroed and the end mask covers every dimension, so the caller only needs to adjust the unstacking axis to slice out one 2D tensor at a time.
+ Coordinates slice_end;
+ slice_start.set_num_dimensions(input_num_dimensions);
+ slice_end.set_num_dimensions(input_num_dimensions);
+ for(size_t k = 0; k < input_num_dimensions; ++k)
+ {
+ slice_start.set(k, 0);
+ slice_end.set(k, -1);
+ }
+ slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+CLUnstack::CLUnstack() // NOLINT
+ : _num_slices(0),
+ _strided_slice_vector()
+{
+}
+
+void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+{
+ std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_axis(axis, input->info());
+ _num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+ _strided_slice_vector = arm_compute::support::cpp14::make_unique<CLStridedSlice[]>(_num_slices);
+
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+ for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ {
+ // Adjusts start and end coordinates to take a 2D slice at a time
+ slice_start.set(axis_u, slice);
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ }
+}
+
+Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+ const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ for(size_t k = 0; k < num_slices; ++k)
+ {
+ slice_start.set(wrap_axis(axis, input), k);
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ }
+ return Status{};
+}
+
+void CLUnstack::run()
+{
+ for(unsigned i = 0; i < _num_slices; ++i)
+ {
+ _strided_slice_vector[i].run();
+ }
+}
+
+} // namespace arm_compute
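A usage sketch of the new function; the (3, 4) F32 input, its allocation, and the auto-initialisation of the output shapes by the underlying strided slices are assumptions made for the example:

#include <vector>

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLUnstack.h"

using namespace arm_compute;

void unstack_rows(CLTensor &input)
{
    CLTensor rows[4];
    std::vector<ICLTensor *> outputs = { &rows[0], &rows[1], &rows[2], &rows[3] };
    CLUnstack unstack;
    unstack.configure(&input, outputs, 1); // one strided slice per element of axis 1
    for(auto &row : rows)
    {
        row.allocator()->allocate();
    }
    unstack.run();
}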
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 46a2d80..d0801a6 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,8 +50,8 @@
ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
// Output auto initialization if not yet initialized
- TensorInfo tmp_output_info = *output->clone();
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ TensorInfo tmp_output_info = *output->clone();
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
switch(num_inputs)
@@ -90,7 +90,7 @@
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 1abcb67..069196e 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -84,8 +84,8 @@
} // namespace
CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
- _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+ : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr),
+ _is_prepared(false)
{
}
@@ -133,14 +133,7 @@
(input->info()->data_type() == DataType::F16)));
// Configure output transform
- _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
-
- // Configure activation layer
- _is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
+ _output_transform.configure(&_batched_mm_output, biases, output, winograd_info, act_info);
// Allocate temporary tensors
_input0.allocator()->allocate();
@@ -216,11 +209,6 @@
// Run output transform
CLScheduler::get().enqueue(_output_transform);
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
-
_memory_group.release();
}
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 09e8456..7361eb2 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
k->configure(input, output, winograd_info);
_kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
}
Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
new file mode 100644
index 0000000..cd97849
--- /dev/null
+++ b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+namespace
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+ bool lhs_interleave, bool rhs_interleave)
+{
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Configure GEMMLHSMatrixInfo
+ lhs_info.m0 = m0;
+ lhs_info.k0 = k0;
+ lhs_info.v0 = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
+ lhs_info.interleave = lhs_interleave;
+ lhs_info.transpose = false;
+
+ // Configure GEMMRHSMatrixInfo
+ rhs_info.n0 = n0;
+ rhs_info.k0 = lhs_info.k0;
+ rhs_info.h0 = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
+ rhs_info.interleave = rhs_interleave;
+ rhs_info.transpose = true;
+
+ return std::make_pair(lhs_info, rhs_info);
+}
+
+} // namespace
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+ ARM_COMPUTE_UNUSED(data_type);
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ // Configurations for Mali-G76
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
+ {
+ { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
+ };
+
+ // Configurations for Mali-G7x
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
+ {
+ { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
+ { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
+ };
+
+ switch(gpu_target)
+ {
+ case GPUTarget::G76:
+ return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
+ default:
+ return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(dot8_supported(CLKernelLibrary::get().get_device()))
+ {
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
+ }
+ }
+ else
+ {
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
+ }
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
+ }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ if(n <= 4)
+ {
+ return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
+ }
+ else
+ {
+ return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
+ }
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
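
The heuristics above pick m0/n0/k0 block sizes per target and then clamp the interleave factors v0 and h0 to 1 when the matrix is too small for the requested grouping. A minimal standalone sketch of that clamping rule, assuming the same meaning for m, n, m0, n0, v0 and h0 as configure_gemm_reshaped() above; the struct names and sample sizes are illustrative, not the library's types.

#include <algorithm>
#include <cstdio>

// Illustrative stand-ins for GEMMLHSMatrixInfo / GEMMRHSMatrixInfo.
struct LhsCfg { unsigned m0, k0, v0; bool interleave, transpose; };
struct RhsCfg { unsigned n0, k0, h0; bool interleave, transpose; };

// Mirror of the clamping done in configure_gemm_reshaped(): if fewer than one
// full group of (block * factor) rows/columns exists, the factor collapses to 1.
static unsigned clamp_factor(unsigned dim, unsigned block, unsigned factor)
{
    return ((dim / (block * factor)) == 0) ? 1U : factor;
}

int main()
{
    const unsigned m = 24, n = 8; // illustrative GEMM dimensions
    LhsCfg lhs{ 4U, 8U, clamp_factor(m, 4U, 16U), true, false };
    RhsCfg rhs{ 2U, lhs.k0, clamp_factor(n, 2U, 16U), false, true };
    std::printf("v0=%u h0=%u\n", lhs.v0, rhs.h0); // both collapse to 1 for these sizes
    return 0;
}
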
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 2b179fd..5916bb4 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -190,15 +190,19 @@
return;
}
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
process_workloads(*_workloads, *_feeder, _info);
+
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(...)
{
_current_exception = std::current_exception();
}
-
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
_job_complete = true;
lock.unlock();
_cv.notify_one();
@@ -250,18 +254,21 @@
info.thread_id = t;
process_workloads(workloads, feeder, info);
-
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
for(auto &thread : _threads)
{
thread.wait();
}
+#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
catch(const std::system_error &e)
{
std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
#endif /* DOXYGEN_SKIP_THIS */
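
The guards added above let CPPScheduler build when exceptions are disabled: the try/catch is compiled out whenever ARM_COMPUTE_EXCEPTIONS_DISABLED is defined, while the workload call itself stays in. A condensed sketch of the same pattern outside the scheduler; do_work() is an illustrative stand-in.

#include <iostream>

void do_work() { /* illustrative workload */ }

void run_guarded()
{
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
    try
    {
#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
        do_work(); // always compiled in
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
    }
    catch(...)
    {
        // Only reachable when the build has exceptions enabled.
        std::cerr << "workload failed\n";
    }
#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
}
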
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
new file mode 100644
index 0000000..79e619c
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <list>
+
+namespace arm_compute
+{
+namespace
+{
+Status detection_layer_validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The confidence input tensor should be [C2, N].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f || info.eta() > 1.f, "Eta should be between 0 and 1");
+
+ const int num_priors = input_priorbox->tensor_shape()[0] / 4;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+
+ // Validate configured output
+ if(output->total_size() != 0)
+ {
+ const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
+ }
+
+ return Status{};
+}
+
+/** Function used to sort pair<float, T> in descending order based on the score (first) value.
+ */
+template <typename T>
+bool SortScorePairDescend(const std::pair<float, T> &pair1,
+ const std::pair<float, T> &pair2)
+{
+ return pair1.first > pair2.first;
+}
+
+/** Get location predictions from input_loc.
+ *
+ * @param[in] input_loc The input location prediction.
+ * @param[in] num The number of images.
+ * @param[in] num_priors number of predictions per class.
+ * @param[in] num_loc_classes number of location classes. It is 1 if share_location is true,
+ * and is equal to number of classes needed to predict otherwise.
+ * @param[in] share_location If true, all classes share the same location prediction.
+ * @param[out] all_location_predictions All the location predictions.
+ *
+ */
+void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
+ const int num_priors, const int num_loc_classes,
+ const bool share_location, std::vector<LabelBBox> &all_location_predictions)
+{
+ for(int i = 0; i < num; ++i)
+ {
+ for(int c = 0; c < num_loc_classes; ++c)
+ {
+ int label = share_location ? -1 : c;
+ if(all_location_predictions[i].find(label) == all_location_predictions[i].end())
+ {
+ all_location_predictions[i][label].resize(num_priors);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(all_location_predictions[i][label].size() != static_cast<size_t>(num_priors));
+ break;
+ }
+ }
+ }
+ for(int i = 0; i < num; ++i)
+ {
+ for(int p = 0; p < num_priors; ++p)
+ {
+ for(int c = 0; c < num_loc_classes; ++c)
+ {
+ const int label = share_location ? -1 : c;
+ const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4;
+ //xmin, ymin, xmax, ymax
+ all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
+ all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
+ all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
+ all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
+ }
+ }
+ }
+}
+
+/** Get confidence predictions from input_conf.
+ *
+ * @param[in] input_conf The input confidence prediction.
+ * @param[in] num The number of images.
+ * @param[in] num_priors Number of predictions per class.
+ * @param[in] num_classes Number of classes.
+ * @param[out] all_confidence_scores All the confidence scores per image and class.
+ *
+ */
+void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
+ const int num_priors, const int num_classes,
+ std::vector<std::map<int, std::vector<float>>> &all_confidence_scores)
+{
+ std::vector<float> tmp_buffer;
+ tmp_buffer.resize(num * num_priors * num_classes);
+ for(int i = 0; i < num; ++i)
+ {
+ for(int c = 0; c < num_classes; ++c)
+ {
+ for(int p = 0; p < num_priors; ++p)
+ {
+ tmp_buffer[i * num_classes * num_priors + c * num_priors + p] =
+ *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
+ }
+ }
+ }
+ for(int i = 0; i < num; ++i)
+ {
+ for(int c = 0; c < num_classes; ++c)
+ {
+ all_confidence_scores[i][c].resize(num_priors);
+ all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors],
+ &tmp_buffer[i * num_classes * num_priors + c * num_priors + num_priors]);
+ }
+ }
+}
+
+/** Get prior boxes from input_priorbox.
+ *
+ * @param[in] input_priorbox The input prior boxes.
+ * @param[in] num_priors Number of priors.
+ * @param[out] all_prior_bboxes All the prior bounding boxes.
+ * @param[out] all_prior_variances All the prior variances.
+ *
+ */
+void retrieve_all_priorbox(const ITensor *input_priorbox,
+ const int num_priors,
+ std::vector<NormalizedBBox> &all_prior_bboxes,
+ std::vector<std::array<float, 4>> &all_prior_variances)
+{
+ for(int i = 0; i < num_priors; ++i)
+ {
+ all_prior_bboxes[i] =
+ {
+ {
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))
+ }
+ };
+ }
+
+ std::array<float, 4> var({ { 0, 0, 0, 0 } });
+ for(int i = 0; i < num_priors; ++i)
+ {
+ for(int j = 0; j < 4; ++j)
+ {
+ var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j)));
+ }
+ all_prior_variances[i] = var;
+ }
+}
+
+/** Decode a bbox according to a prior bbox.
+ *
+ * @param[in] prior_bbox The input prior bounding boxes.
+ * @param[in] prior_variance The corresponding input variance.
+ * @param[in] code_type The detection output code type used to decode the results.
+ * @param[in] variance_encoded_in_target If true, the variance is encoded in target.
+ * @param[in] clip_bbox If true, the results should be between 0.f and 1.f.
+ * @param[in] bbox The input bbox to decode
+ * @param[out] decode_bbox The decoded bboxes.
+ *
+ */
+void DecodeBBox(const NormalizedBBox &prior_bbox, const std::array<float, 4> &prior_variance,
+ const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target,
+ const bool clip_bbox, const NormalizedBBox &bbox, NormalizedBBox &decode_bbox)
+{
+ // if the variance is encoded in target, we simply need to add the offset predictions
+ // otherwise we need to scale the offset accordingly.
+ switch(code_type)
+ {
+ case DetectionOutputLayerCodeType::CORNER:
+ {
+ decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]);
+ decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]);
+ decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]);
+ decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]);
+
+ break;
+ }
+ case DetectionOutputLayerCodeType::CENTER_SIZE:
+ {
+ const float prior_width = prior_bbox[2] - prior_bbox[0];
+ const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+ // Check if the prior width and height are right
+ ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+ ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+ const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
+ const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
+
+ const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
+ const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
+ const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
+ const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
+
+ decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f);
+ decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f);
+ decode_bbox[2] = (decode_bbox_center_x + decode_bbox_width / 2.f);
+ decode_bbox[3] = (decode_bbox_center_y + decode_bbox_height / 2.f);
+
+ break;
+ }
+ case DetectionOutputLayerCodeType::CORNER_SIZE:
+ {
+ const float prior_width = prior_bbox[2] - prior_bbox[0];
+ const float prior_height = prior_bbox[3] - prior_bbox[1];
+
+ // Check if the prior width and height are greater than 0
+ ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
+ ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
+
+ decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
+ decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
+ decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
+ decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type.");
+ }
+
+ if(clip_bbox)
+ {
+ for(auto &d_bbox : decode_bbox)
+ {
+ d_bbox = utility::clamp(d_bbox, 0.f, 1.f);
+ }
+ }
+}
+
+/** Do non maximum suppression given bboxes and scores.
+ *
+ * @param[in] bboxes The input bounding boxes.
+ * @param[in] scores The corresponding input confidence.
+ * @param[in] score_threshold The threshold used to filter detection results.
+ * @param[in] nms_threshold The threshold used in non maximum suppression.
+ * @param[in] eta Adaptation rate for nms threshold.
+ * @param[in] top_k If not -1, keep at most top_k picked indices.
+ * @param[out] indices The kept indices of bboxes after nms.
+ *
+ */
+void ApplyNMSFast(const std::vector<NormalizedBBox> &bboxes,
+ const std::vector<float> &scores, const float score_threshold,
+ const float nms_threshold, const float eta, const int top_k,
+ std::vector<int> &indices)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size.");
+
+ // Get top_k scores (with corresponding indices).
+ std::list<std::pair<float, int>> score_index_vec;
+
+ // Generate index score pairs.
+ for(size_t i = 0; i < scores.size(); ++i)
+ {
+ if(scores[i] > score_threshold)
+ {
+ score_index_vec.emplace_back(std::make_pair(scores[i], i));
+ }
+ }
+
+ // Sort the score pair according to the scores in descending order
+ score_index_vec.sort(SortScorePairDescend<int>);
+
+ // Keep top_k scores if needed.
+ const int score_index_vec_size = score_index_vec.size();
+ if(top_k > -1 && top_k < score_index_vec_size)
+ {
+ score_index_vec.resize(top_k);
+ }
+
+ // Do nms.
+ float adaptive_threshold = nms_threshold;
+ indices.clear();
+
+ while(!score_index_vec.empty())
+ {
+ const int idx = score_index_vec.front().second;
+ bool keep = true;
+ for(int kept_idx : indices)
+ {
+ if(keep)
+ {
+ // Compute the jaccard (intersection over union IoU) overlap between two bboxes.
+ NormalizedBBox intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+ if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
+ {
+ intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+ }
+ else
+ {
+ intersect_bbox = std::array<float, 4>({ {
+ std::max(bboxes[idx][0], bboxes[kept_idx][0]),
+ std::max(bboxes[idx][1], bboxes[kept_idx][1]),
+ std::min(bboxes[idx][2], bboxes[kept_idx][2]),
+ std::min(bboxes[idx][3], bboxes[kept_idx][3])
+ }
+ });
+ }
+
+ float intersect_width = intersect_bbox[2] - intersect_bbox[0];
+ float intersect_height = intersect_bbox[3] - intersect_bbox[1];
+
+ float overlap = 0.f;
+ if(intersect_width > 0 && intersect_height > 0)
+ {
+ float intersect_size = intersect_width * intersect_height;
+ float bbox1_size = (bboxes[idx][2] < bboxes[idx][0]
+ || bboxes[idx][3] < bboxes[idx][1]) ?
+ 0.f :
+ (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
+ float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0]
+ || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ?
+ 0.f :
+ (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
+ overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size);
+ }
+ keep = (overlap <= adaptive_threshold);
+ }
+ else
+ {
+ break;
+ }
+ }
+ if(keep)
+ {
+ indices.push_back(idx);
+ }
+ score_index_vec.erase(score_index_vec.begin());
+ if(keep && eta < 1.f && adaptive_threshold > 0.5f)
+ {
+ adaptive_threshold *= eta;
+ }
+ }
+}
+
+Status non_max_suppression_validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+ const float score_threshold, const float nms_threshold)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, indices);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "Scores must be a 1D float tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(nms_threshold < 0.f || nms_threshold > 1.f, "Threshold must be in [0,1]");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(score_threshold < 0.f || score_threshold > 1.f, "Threshold must be in [0,1]");
+
+ return Status{};
+}
+} // namespace
+
+CPPNonMaximumSuppression::CPPNonMaximumSuppression()
+ : _bboxes(nullptr), _scores(nullptr), _indices(nullptr), _max_output_size(0), _score_threshold(0.f), _nms_threshold(0.f)
+{
+}
+
+void CPPNonMaximumSuppression::configure(
+ const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
+ const float score_threshold, const float nms_threshold)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(bboxes, scores, indices);
+ ARM_COMPUTE_ERROR_THROW_ON(non_max_suppression_validate_arguments(bboxes->info(), scores->info(), indices->info(), max_output_size, score_threshold, nms_threshold));
+
+ // Store the tensors and thresholds used by run()
+ _bboxes = bboxes;
+ _scores = scores;
+ _indices = indices;
+
+ _nms_threshold = nms_threshold;
+ _max_output_size = max_output_size;
+ _score_threshold = score_threshold;
+}
+
+Status CPPNonMaximumSuppression::validate(
+ const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+ const float score_threshold, const float nms_threshold)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(non_max_suppression_validate_arguments(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold));
+ return Status{};
+}
+
+void extract_bounding_boxes_from_tensor(const ITensor *bboxes, std::vector<NormalizedBBox> &bboxes_vector)
+{
+ Window input_win;
+ input_win.use_tensor_dimensions(bboxes->info()->tensor_shape());
+ input_win.set_dimension_step(0U, 4U);
+ input_win.set_dimension_step(1U, 1U);
+ Iterator input(bboxes, input_win);
+ auto f = [&bboxes_vector, &input](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ bboxes_vector.push_back(NormalizedBBox({ { *input_ptr, *(input_ptr + 1), *(2 + input_ptr), *(3 + input_ptr) } }));
+ };
+ execute_window_loop(input_win, f, input);
+}
+
+void extract_scores_from_tensor(const ITensor *scores, std::vector<float> &scores_vector)
+{
+ Window window;
+ window.use_tensor_dimensions(scores->info()->tensor_shape());
+ Iterator it(scores, window);
+ auto f = [&it, &scores_vector](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(it.ptr());
+ scores_vector.push_back(*input_ptr);
+ };
+ execute_window_loop(window, f, it);
+}
+
+void CPPNonMaximumSuppression::run()
+{
+ std::vector<NormalizedBBox> bboxes_vector;
+ std::vector<float> scores_vector;
+ std::vector<int> indices_vector;
+ extract_bounding_boxes_from_tensor(_bboxes, bboxes_vector);
+ extract_scores_from_tensor(_scores, scores_vector);
+ ApplyNMSFast(bboxes_vector, scores_vector, _score_threshold, _nms_threshold, 1, -1 /* disable top_k */, indices_vector);
+ std::copy_n(indices_vector.begin(), std::min(indices_vector.size(), _indices->info()->dimension(0)), reinterpret_cast<int *>(_indices->ptr_to_element(Coordinates(0))));
+}
+
+CPPDetectionOutputLayer::CPPDetectionOutputLayer()
+ : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
+ _all_prior_variances(), _all_decode_bboxes(), _all_indices()
+{
+}
+
+void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
+ // Output auto initialization if not yet initialized
+ // Since the number of bboxes to keep is unknown before NMS, the shape is set to the maximum,
+ // which is keep_top_k * input_loc_size[1].
+ // Each row stores 7 values: [image_id, label, confidence, xmin, ymin, xmax, ymax]
+ const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
+ auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(detection_layer_validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+
+ _input_loc = input_loc;
+ _input_conf = input_conf;
+ _input_priorbox = input_priorbox;
+ _output = output;
+ _info = info;
+ _num_priors = input_priorbox->info()->dimension(0) / 4;
+ _num = (_input_loc->info()->num_dimensions() > 1 ? _input_loc->info()->dimension(1) : 1);
+
+ _all_location_predictions.resize(_num);
+ _all_confidence_scores.resize(_num);
+ _all_prior_bboxes.resize(_num_priors);
+ _all_prior_variances.resize(_num_priors);
+ _all_decode_bboxes.resize(_num);
+
+ for(int i = 0; i < _num; ++i)
+ {
+ for(int c = 0; c < _info.num_loc_classes(); ++c)
+ {
+ const int label = _info.share_location() ? -1 : c;
+ if(label == _info.background_label_id())
+ {
+ // Ignore background class.
+ continue;
+ }
+ _all_decode_bboxes[i][label].resize(_num_priors);
+ }
+ }
+ _all_indices.resize(_num);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+}
+
+Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(detection_layer_validate_arguments(input_loc, input_conf, input_priorbox, output, info));
+ return Status{};
+}
+
+void CPPDetectionOutputLayer::run()
+{
+ // Retrieve all location predictions.
+ retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions);
+
+ // Retrieve all confidences.
+ retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores);
+
+ // Retrieve all prior bboxes.
+ retrieve_all_priorbox(_input_priorbox, _num_priors, _all_prior_bboxes, _all_prior_variances);
+
+ // Decode all loc predictions to bboxes
+ const bool clip_bbox = false;
+ for(int i = 0; i < _num; ++i)
+ {
+ for(int c = 0; c < _info.num_loc_classes(); ++c)
+ {
+ const int label = _info.share_location() ? -1 : c;
+ if(label == _info.background_label_id())
+ {
+ // Ignore background class.
+ continue;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+
+ const std::vector<NormalizedBBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
+
+ const int num_bboxes = _all_prior_bboxes.size();
+ ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4);
+
+ for(int j = 0; j < num_bboxes; ++j)
+ {
+ DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]);
+ }
+ }
+ }
+
+ int num_kept = 0;
+
+ for(int i = 0; i < _num; ++i)
+ {
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+
+ std::map<int, std::vector<int>> indices;
+ int num_det = 0;
+ for(int c = 0; c < _info.num_classes(); ++c)
+ {
+ if(c == _info.background_label_id())
+ {
+ // Ignore background class
+ continue;
+ }
+ const int label = _info.share_location() ? -1 : c;
+ if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
+ {
+ ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+ }
+ const std::vector<float> &scores = conf_scores.find(c)->second;
+ const std::vector<NormalizedBBox> &bboxes = decode_bboxes.find(label)->second;
+
+ ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]);
+
+ num_det += indices[c].size();
+ }
+
+ int num_to_add = 0;
+ if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
+ {
+ std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+ for(auto it : indices)
+ {
+ const int label = it.first;
+ const std::vector<int> &label_indices = it.second;
+
+ if(conf_scores.find(label) == conf_scores.end())
+ {
+ ARM_COMPUTE_ERROR("Could not find predictions for label %d.", label);
+ }
+
+ const std::vector<float> &scores = conf_scores.find(label)->second;
+ for(auto idx : label_indices)
+ {
+ ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
+ score_index_pairs.push_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
+ }
+ }
+
+ // Keep top k results per image.
+ std::sort(score_index_pairs.begin(), score_index_pairs.end(), SortScorePairDescend<std::pair<int, int>>);
+ score_index_pairs.resize(_info.keep_top_k());
+
+ // Store the new indices.
+
+ std::map<int, std::vector<int>> new_indices;
+ for(auto score_index_pair : score_index_pairs)
+ {
+ int label = score_index_pair.second.first;
+ int idx = score_index_pair.second.second;
+ new_indices[label].push_back(idx);
+ }
+ _all_indices[i] = new_indices;
+ num_to_add = _info.keep_top_k();
+ }
+ else
+ {
+ _all_indices[i] = indices;
+ num_to_add = num_det;
+ }
+ num_kept += num_to_add;
+ }
+
+ // Update the valid region of the output to mark the exact number of detections
+ _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept)));
+
+ int count = 0;
+ for(int i = 0; i < _num; ++i)
+ {
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ for(auto &it : _all_indices[i])
+ {
+ const int label = it.first;
+ const std::vector<float> &scores = conf_scores.find(label)->second;
+ const int loc_label = _info.share_location() ? -1 : label;
+ if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
+ {
+ // Either there are no confidence predictions
+ // or no location predictions for the current label.
+ ARM_COMPUTE_ERROR("Could not find predictions for the label %d.", label);
+ }
+ const std::vector<NormalizedBBox> &bboxes = decode_bboxes.find(loc_label)->second;
+ const std::vector<int> &indices = it.second;
+
+ for(auto idx : indices)
+ {
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7)))) = i;
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label;
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 2)))) = scores[idx];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 3)))) = bboxes[idx][0];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 4)))) = bboxes[idx][1];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 5)))) = bboxes[idx][2];
+ *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 6)))) = bboxes[idx][3];
+
+ ++count;
+ }
+ }
+ }
+}
+} // namespace arm_compute
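
ApplyNMSFast() above keeps a candidate box only while its jaccard (IoU) overlap with every already-kept box stays at or below an adaptive threshold that decays by eta. A self-contained sketch of that overlap test on the same [xmin, ymin, xmax, ymax] layout; the box coordinates are made-up values for illustration.

#include <algorithm>
#include <array>
#include <cstdio>

using Box = std::array<float, 4>; // {xmin, ymin, xmax, ymax}, as in NormalizedBBox

// Intersection-over-union between two normalized boxes, mirroring the
// overlap computation inside ApplyNMSFast().
static float iou(const Box &a, const Box &b)
{
    const float ix = std::min(a[2], b[2]) - std::max(a[0], b[0]); // intersection width
    const float iy = std::min(a[3], b[3]) - std::max(a[1], b[1]); // intersection height
    if(ix <= 0.f || iy <= 0.f)
    {
        return 0.f; // boxes do not overlap
    }
    const float inter  = ix * iy;
    const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
    const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
    return inter / (area_a + area_b - inter);
}

int main()
{
    const Box kept      = { { 0.1f, 0.1f, 0.5f, 0.5f } };
    const Box candidate = { { 0.2f, 0.2f, 0.6f, 0.6f } };
    float       adaptive_threshold = 0.5f;
    const float eta                = 0.9f;

    const bool keep = iou(kept, candidate) <= adaptive_threshold;
    if(keep && eta < 1.f && adaptive_threshold > 0.5f)
    {
        adaptive_threshold *= eta; // decay, as in ApplyNMSFast()
    }
    std::printf("iou=%.3f keep=%d\n", iou(kept, candidate), keep);
    return 0;
}
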
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
new file mode 100644
index 0000000..c4e1eab
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
+
+#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+{
+ auto kernel = arm_compute::support::cpp14::make_unique<CPPTopKVKernel>();
+ kernel->configure(predictions, targets, output, k);
+ _kernel = std::move(kernel);
+}
+
+Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+{
+ return CPPTopKVKernel::validate(predictions, targets, output, k);
+}
+} // namespace arm_compute
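
A possible usage sketch for the new CPPTopKV function, following the usual Tensor/TensorInfo allocation pattern; the shapes and the U32/U8 data types chosen for targets and output are assumptions made for illustration, not requirements stated by this diff.

#include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
    // 10 class scores for each of 2 samples, plus one target class index per sample.
    Tensor predictions, targets, output;
    predictions.allocator()->init(TensorInfo(TensorShape(10U, 2U), 1, DataType::F32));
    targets.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); // assumed index type
    output.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U8));   // assumed mask type

    CPPTopKV topkv;
    topkv.configure(&predictions, &targets, &output, 5 /* k */); // is the target within the top-5 predictions?

    predictions.allocator()->allocate();
    targets.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill predictions/targets here ...
    topkv.run();
    return 0;
}
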
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index ac19d08..f3355a7 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -39,7 +39,8 @@
#include <unistd.h>
#ifndef BARE_METAL
-#include <regex>
+/* C++ std::regex takes up a lot of space in the standalone builds */
+#include <regex.h>
#include <thread>
#endif /* BARE_METAL */
@@ -94,6 +95,7 @@
return false;
}
}
+
/* Convert an MIDR register value to a CPUModel enum value. */
CPUModel midr_to_model(const unsigned int midr)
{
@@ -144,6 +146,19 @@
break;
}
}
+ else if(implementer == 0x48) // HiSilicon CPUs
+ {
+ // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
+ switch(cpunum)
+ {
+ case 0xd40: // A76 (Kirin 980)
+ model = CPUModel::GENERIC_FP16_DOT;
+ break;
+ default:
+ model = CPUModel::GENERIC;
+ break;
+ }
+ }
return model;
}
@@ -172,12 +187,27 @@
void populate_models_cpuinfo(std::vector<CPUModel> &cpusv)
{
+ regex_t proc_regex;
+ regex_t imp_regex;
+ regex_t var_regex;
+ regex_t part_regex;
+ regex_t rev_regex;
+
+ memset(&proc_regex, 0, sizeof(regex_t));
+ memset(&imp_regex, 0, sizeof(regex_t));
+ memset(&var_regex, 0, sizeof(regex_t));
+ memset(&part_regex, 0, sizeof(regex_t));
+ memset(&rev_regex, 0, sizeof(regex_t));
+
+ int ret_status = 0;
// If "long-form" cpuinfo is present, parse that to populate models.
- std::regex proc_regex(R"(^processor.*(\d+)$)");
- std::regex imp_regex(R"(^CPU implementer.*0x(..)$)");
- std::regex var_regex(R"(^CPU variant.*0x(.)$)");
- std::regex part_regex(R"(^CPU part.*0x(...)$)");
- std::regex rev_regex(R"(^CPU revision.*(\d+)$)");
+ ret_status |= regcomp(&proc_regex, R"(^processor.*([[:digit:]]+)$)", REG_EXTENDED);
+ ret_status |= regcomp(&imp_regex, R"(^CPU implementer.*0x(..)$)", REG_EXTENDED);
+ ret_status |= regcomp(&var_regex, R"(^CPU variant.*0x(.)$)", REG_EXTENDED);
+ ret_status |= regcomp(&part_regex, R"(^CPU part.*0x(...)$)", REG_EXTENDED);
+ ret_status |= regcomp(&rev_regex, R"(^CPU revision.*([[:digit:]]+)$)", REG_EXTENDED);
+ ARM_COMPUTE_UNUSED(ret_status);
+ ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
std::ifstream file;
file.open("/proc/cpuinfo", std::ios::in);
@@ -190,11 +220,11 @@
while(bool(getline(file, line)))
{
- std::smatch match;
-
- if(std::regex_match(line, match, proc_regex))
+ regmatch_t match[2];
+ ret_status = regexec(&proc_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- std::string id = match[1];
+ std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
int newcpu = support::cpp11::stoi(id, nullptr);
if(curcpu >= 0 && midr == 0)
@@ -214,32 +244,44 @@
continue;
}
- if(std::regex_match(line, match, imp_regex))
+ ret_status = regexec(&imp_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int impv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int impv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
midr |= (impv << 24);
+
continue;
}
- if(std::regex_match(line, match, var_regex))
+ ret_status = regexec(&var_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int varv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int varv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
midr |= (varv << 20);
+
continue;
}
- if(std::regex_match(line, match, part_regex))
+ ret_status = regexec(&part_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int partv = support::cpp11::stoi(match[1], nullptr, support::cpp11::NumericBase::BASE_16);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int partv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
midr |= (partv << 4);
+
continue;
}
- if(std::regex_match(line, match, rev_regex))
+ ret_status = regexec(&rev_regex, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- int regv = support::cpp11::stoi(match[1], nullptr);
+ std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
+ int regv = support::cpp11::stoi(subexp, nullptr);
midr |= (regv);
midr |= (0xf << 16);
+
continue;
}
}
@@ -249,6 +291,13 @@
cpusv[curcpu] = midr_to_model(midr);
}
}
+
+ // Free allocated memory
+ regfree(&proc_regex);
+ regfree(&imp_regex);
+ regfree(&var_regex);
+ regfree(&part_regex);
+ regfree(&rev_regex);
}
int get_max_cpus()
@@ -364,8 +413,11 @@
std::map<std::string, unsigned int> cpu_part_occurrence_map;
// CPU part regex
- std::regex cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
- std::smatch cpu_part_match;
+ regex_t cpu_part_rgx;
+ memset(&cpu_part_rgx, 0, sizeof(regex_t));
+ int ret_status = regcomp(&cpu_part_rgx, R"(.*CPU part.+/?\:[[:space:]]+([[:alnum:]]+).*)", REG_EXTENDED);
+ ARM_COMPUTE_UNUSED(ret_status);
+ ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
// Read cpuinfo and get occurrence of each core
std::ifstream cpuinfo;
@@ -375,9 +427,11 @@
std::string line;
while(bool(getline(cpuinfo, line)))
{
- if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+ regmatch_t match[2];
+ ret_status = regexec(&cpu_part_rgx, line.c_str(), 2, match, 0);
+ if(ret_status == 0)
{
- std::string cpu_part = cpu_part_match[1];
+ std::string cpu_part = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
{
cpu_part_occurrence_map[cpu_part]++;
@@ -389,6 +443,7 @@
}
}
}
+ regfree(&cpu_part_rgx);
// Get min number of threads
auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
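
The hunks above drop C++ <regex> in favour of the much smaller POSIX <regex.h> interface and pull capture groups out through the rm_so/rm_eo offsets of a regmatch_t. A standalone sketch of that extraction on a made-up "CPU part" line, rather than a real /proc/cpuinfo read.

#include <regex.h>
#include <cstdio>
#include <string>

int main()
{
    regex_t part_regex;
    // Same flavour of pattern as populate_models_cpuinfo(): capture the hex part id.
    int ret = regcomp(&part_regex, "^CPU part.*0x(...)$", REG_EXTENDED);
    if(ret != 0)
    {
        std::fprintf(stderr, "regex compilation failed\n");
        return 1;
    }

    const std::string line = "CPU part        : 0xd40"; // illustrative cpuinfo line
    regmatch_t match[2];                                // [0] whole match, [1] first capture group
    if(regexec(&part_regex, line.c_str(), 2, match, 0) == 0)
    {
        const std::string part = line.substr(match[1].rm_so, match[1].rm_eo - match[1].rm_so);
        std::printf("CPU part = 0x%s\n", part.c_str()); // prints "CPU part = 0xd40"
    }

    regfree(&part_regex); // release memory allocated by regcomp
    return 0;
}
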
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index c58d184..a35a18a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -170,7 +170,7 @@
{
BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
input->info()->extend_padding(border_size);
- _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border
+ _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue()); // for PAD of im2col fp16: consider it as border
}
// Configure im2col
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index 689d8be..aa937a6 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,7 +53,7 @@
for(unsigned int i = 0; i < _num_inputs; i++)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
- _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue());
depth_offset += inputs_vector.at(i)->info()->dimension(2);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index d9aa50d..ba05838 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@
_kernel = std::move(k);
// Configure border handler
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
_shift_handler.configure(input);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index c0cf098..cb14b8a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,7 +68,7 @@
return;
}
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
_shift_handler.configure(input);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index b2e69ee..2569365 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,7 +48,7 @@
_norm_kernel.configure(input, &_squared_input, output, norm_info);
_multiply_kernel.configure(input, input, &_squared_input, 1.0f);
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
// Allocate intermediate buffers
_squared_input.allocator()->allocate();
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index 7d928d6..97c20d1 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -59,7 +59,7 @@
// Check if there is a free blob
if(_free_blobs.empty())
{
- _occupied_blobs.emplace_front(Blob{ obj, 0, { obj } });
+ _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
}
else
{
@@ -71,7 +71,7 @@
_active_elements.insert(std::make_pair(obj, obj));
}
-void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size)
+void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size, size_t alignment)
{
ARM_COMPUTE_ERROR_ON(obj == nullptr);
@@ -80,10 +80,11 @@
ARM_COMPUTE_ERROR_ON(active_object_it == std::end(_active_elements));
// Update object fields and mark object as complete
- Element &el = active_object_it->second;
- el.handle = &obj_memory;
- el.size = size;
- el.status = true;
+ Element &el = active_object_it->second;
+ el.handle = &obj_memory;
+ el.size = size;
+ el.alignment = alignment;
+ el.status = true;
// Find object in the occupied lists
auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
@@ -94,8 +95,9 @@
// Update occupied blob and return as free
occupied_blob_it->bound_elements.insert(obj);
- occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
- occupied_blob_it->id = nullptr;
+ occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
+ occupied_blob_it->max_alignment = std::max(occupied_blob_it->max_alignment, alignment);
+ occupied_blob_it->id = nullptr;
_free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
// Check if all object are finalized and reset active group
diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp
index ad00070..5ae1c2a 100644
--- a/src/runtime/MEMUtils.cpp
+++ b/src/runtime/MEMUtils.cpp
@@ -27,7 +27,7 @@
#ifndef BARE_METAL
#include <fstream>
-#include <regex>
+#include <iterator>
#include <sstream>
#endif // ifndef BARE_METAL
@@ -43,41 +43,33 @@
size_t memfree = 0;
std::ifstream meminfo_f;
meminfo_f.open("/proc/meminfo", std::ios::in);
+
if(meminfo_f.is_open())
{
- std::stringstream str_stream;
- str_stream << meminfo_f.rdbuf();
- const std::string str = str_stream.str();
- try
+ std::string line;
+ while(bool(getline(meminfo_f, line)))
{
- std::smatch match;
- if(std::regex_search(str, match, std::regex("MemTotal: (.*)kB")) && match.size() > 1)
+ std::istringstream iss(line);
+ std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
+ std::istream_iterator<std::string>());
+ if(tokens[0] == "MemTotal:")
{
- const std::string result = match.str(1);
- total = std::stoul(result, nullptr, 0);
+ total = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- if(std::regex_search(str, match, std::regex("MemFree: (.*)kB")) && match.size() > 1)
+ else if(tokens[0] == "MemFree:")
{
- const std::string result = match.str(1);
- memfree = std::stoul(result, nullptr, 0);
+ memfree = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- if(std::regex_search(str, match, std::regex("Buffers: (.*)kB")) && match.size() > 1)
+ else if(tokens[0] == "Buffers:")
{
- const std::string result = match.str(1);
- buffer = std::stoul(result, nullptr, 0);
+ buffer = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- if(std::regex_search(str, match, std::regex("Cached: (.*)kB")) && match.size() > 1)
+ else if(tokens[0] == "Cached:")
{
- const std::string result = match.str(1);
- memcache = std::stoul(result, nullptr, 0);
+ memcache = arm_compute::support::cpp11::stoul(tokens[1], nullptr);
}
- free = memfree + (buffer + memcache);
}
- catch(std::regex_error &e)
- {
- // failed parsing /proc/meminfo
- // return 0s on all fields
- }
+ free = memfree + (buffer + memcache);
}
#endif // ifndef BARE_METAL
}
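
The MEMUtils rewrite above replaces regex searches over the whole of /proc/meminfo with a per-line whitespace tokenizer. A small sketch of that tokenization step on a canned line; the input string and the use of std::stoul in place of the library's support::cpp11::stoul are illustrative.

#include <cstdio>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

int main()
{
    // One line in the format found in /proc/meminfo (illustrative value).
    const std::string line = "MemTotal:       3976696 kB";

    // Split on whitespace, exactly as the new parsing loop does.
    std::istringstream       iss(line);
    std::vector<std::string> tokens((std::istream_iterator<std::string>(iss)),
                                    std::istream_iterator<std::string>());

    size_t total_kb = 0;
    if(!tokens.empty() && tokens[0] == "MemTotal:")
    {
        total_kb = std::stoul(tokens[1]); // value is reported in kB
    }
    std::printf("MemTotal = %zu kB\n", total_kb);
    return 0;
}
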
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
new file mode 100644
index 0000000..1287204
--- /dev/null
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+INESimpleFunctionNoBorder::INESimpleFunctionNoBorder() // NOLINT
+ : _kernel()
+{
+}
+
+void INESimpleFunctionNoBorder::run()
+{
+ NEScheduler::get().schedule(_kernel.get(), Window::DimY);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
new file mode 100644
index 0000000..d33e134
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernel(), _fill_border_kernel(), _run_fill_border(false)
+{
+}
+void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
+{
+ _reduction_kernel.configure(input, output, axis, op);
+
+ if(axis == 0)
+ {
+ _fill_border_kernel.configure(input, _reduction_kernel.border_size(), BorderMode::REPLICATE);
+ _run_fill_border = true;
+ }
+}
+
+Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+ return Status{};
+}
+
+void NEArgMinMaxLayer::run()
+{
+ _memory_group.acquire();
+
+ if(_run_fill_border)
+ {
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_reduction_kernel, Window::DimY);
+
+ _memory_group.release();
+}
+
+} // namespace arm_compute
\ No newline at end of file
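
A possible usage sketch for NEArgMinMaxLayer as configured above, reducing a 2D float tensor along axis 0; the output shape and the U32 index data type are assumptions made for illustration and may not match what the kernel actually expects.

#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::U32)); // assumed index output

    NEArgMinMaxLayer argmax(nullptr); // no external memory manager
    argmax.configure(&src, 0 /* axis */, &dst, ReductionOperation::ARG_IDX_MAX);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src here ...
    argmax.run();
    return 0;
}
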
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 677e9f6..b155077 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,16 +36,6 @@
auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
-
- if(output->info()->dimension(0) > 1)
- {
- ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
- if(broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
}
Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 931e5db..5059162 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -148,8 +148,7 @@
return (*found).second;
}
- if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
- || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
+ if(dilation != Size2D(1U, 1U) || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
{
return ConvolutionMethod::GEMM;
}
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 6887a0a..44d7197 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -145,6 +145,15 @@
_conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
_scaled_output.allocator()->allocate();
}
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info)
+{
+ return NEDeconvolutionLayer::validate(input, weights, bias, output, info, 0, 0);
+}
+
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info)
+{
+ configure(input, weights, bias, output, info, 0, 0);
+}
void NEDeconvolutionLayer::run()
{
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a2f0094..f0fd4cf 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,7 +72,7 @@
accum_layout = DataLayout::NCHW;
}
- _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
_accumulator.info()->set_data_layout(accum_layout);
zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
@@ -271,7 +271,7 @@
const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_UNUSED(channel_idx);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
new file mode 100644
index 0000000..74c1957
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+#include <arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h>
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MAX, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
+}
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::MIN, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
+}
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
+ k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
+}
+
+void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEDivisionOperationKernel::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+ k->configure(COP, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return NEComparisonOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
+ k->configure(op, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+{
+ return NEComparisonOperationKernel::validate(op, input1, input2, output);
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace arm_compute
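Note: illustrative usage of one of the new element-wise functions defined above (shapes and data type are assumptions; the other functions follow the same configure/run pattern):

#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

    NEElementwiseMax max_op;
    max_op.configure(&a, &b, &out); // element-wise max(a, b)

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    // ... fill a and b ...
    max_op.run();
    return 0;
}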
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
new file mode 100644
index 0000000..10142c7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NERsqrtLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+ k->configure(ElementWiseUnary::RSQRT, input, output);
+ _kernel = std::move(k);
+}
+Status NERsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, input, output);
+}
+
+void NEExpLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernel>();
+ k->configure(ElementWiseUnary::EXP, input, output);
+ _kernel = std::move(k);
+}
+Status NEExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEElementwiseUnaryKernel::validate(ElementWiseUnary::EXP, input, output);
+}
+} // namespace arm_compute
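Note: illustrative usage of the new unary layers above (shape and data type are assumptions; NEExpLayer is used the same way):

#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

    NERsqrtLayer rsqrt;
    rsqrt.configure(&src, &dst); // dst[i] = 1 / sqrt(src[i])

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src with positive values ...
    rsqrt.run();
    return 0;
}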
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
new file mode 100644
index 0000000..dc48731
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEFuseBatchNormalization::NEFuseBatchNormalization()
+ : _fuse_bn_kernel()
+{
+}
+
+void NEFuseBatchNormalization::configure(const ITensor *conv_weights, const ITensor *bn_mean, const ITensor *bn_var,
+ ITensor *fused_weights, ITensor *fused_bias,
+ const ITensor *conv_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
+ float epsilon)
+{
+ _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+Status NEFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+ const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+ float epsilon)
+{
+ return NEFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+void NEFuseBatchNormalization::run()
+{
+ NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY);
+}
+} // namespace arm_compute
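Note: illustrative usage of the new fusion function above, assuming the trailing conv_bias/bn_beta/bn_gamma arguments are optional and may be left null; shapes and epsilon are illustrative only:

#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    // 3x3 convolution weights with 16 input and 32 output channels
    Tensor conv_w, bn_mean, bn_var, fused_w, fused_b;
    conv_w.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
    bn_mean.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    bn_var.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    fused_w.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
    fused_b.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

    NEFuseBatchNormalization fuse;
    // conv_bias, bn_beta and bn_gamma are left as nullptr here
    fuse.configure(&conv_w, &bn_mean, &bn_var, &fused_w, &fused_b, nullptr, nullptr, nullptr, 0.001f);

    conv_w.allocator()->allocate();
    bn_mean.allocator()->allocate();
    bn_var.allocator()->allocate();
    fused_w.allocator()->allocate();
    fused_b.allocator()->allocate();

    // ... fill conv_w, bn_mean and bn_var ...
    fuse.run();
    return 0;
}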
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 72a3e80..914f088 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,8 +91,8 @@
shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+ TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 922f757..470e922 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,6 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
@@ -38,14 +35,14 @@
{
namespace
{
-std::unique_ptr<IFunction> create_function_all_types(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
+std::unique_ptr<IFunction> create_function_all_types(arm_gemm::KernelDescription gemm_kernel_info,
+ const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
std::shared_ptr<IMemoryManager> memory_manager)
{
//Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
- switch(method)
+ switch(gemm_kernel_info.method)
{
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16:
case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
{
if(!pretranspose_hint)
@@ -56,99 +53,41 @@
function->configure(a, b, d, alpha, beta, pretranspose_hint);
return std::move(function);
}
- default:
- return nullptr;
- }
-}
-
-template <typename TypeInput, typename TypeOutput>
-std::unique_ptr<IFunction> create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- ARM_COMPUTE_UNUSED(method);
- ARM_COMPUTE_UNUSED(a);
- ARM_COMPUTE_UNUSED(b);
- ARM_COMPUTE_UNUSED(d);
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_UNUSED(pretranspose_hint);
- ARM_COMPUTE_UNUSED(memory_manager);
- return nullptr;
-}
-
-#ifdef __aarch64__
-template <>
-std::unique_ptr<IFunction> create_function<int8_t, int32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- switch(method)
- {
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
- {
- if(!pretranspose_hint)
- {
- return nullptr;
- }
- auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
- function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
- return std::move(function);
- }
- default:
- return nullptr;
- }
- return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<uint8_t, uint32_t>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- switch(method)
- {
- case arm_gemm::GemmMethod::GEMM_INTERLEAVED_DOT:
- {
- if(!pretranspose_hint)
- {
- return nullptr;
- }
- auto function = support::cpp14::make_unique<NEGEMMInterleavedWrapper>(memory_manager);
- function->configure(a, b, d, alpha, beta, pretranspose_hint, true /* use_dot */);
- return std::move(function);
- }
- default:
- return nullptr;
- }
- return nullptr;
-}
-
-template <>
-std::unique_ptr<IFunction> create_function<float, float>(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint,
- std::shared_ptr<IMemoryManager> memory_manager)
-{
- ARM_COMPUTE_UNUSED(pretranspose_hint);
- ARM_COMPUTE_UNUSED(memory_manager);
- switch(method)
- {
+#if defined(__aarch64__)
case arm_gemm::GemmMethod::GEMM_NATIVE:
{
- auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
- kernel->configure(a, b, d, alpha, beta);
- auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
- function->configure(std::move(kernel));
- return std::move(function);
+ if(gemm_kernel_info.name.find("sgemm_native_16x4") != std::string::npos)
+ {
+ auto kernel = support::cpp14::make_unique<NEGEMMNativeWrapperKernel<float, float>>();
+ kernel->configure(a, b, d, alpha, beta);
+ auto function = support::cpp14::make_unique<NESimpleAssemblyFunction>();
+ function->configure(std::move(kernel));
+ return std::move(function);
+ }
+ return nullptr;
}
+#endif // defined(__aarch64__)
default:
return nullptr;
}
}
-#endif /* __aarch64__ */
/** Fallback in case ACL doesn't have a function */
template <typename TypeInput, typename TypeOutput>
class Fallback : public NEGEMMAssemblyDispatch::IFallback
{
public:
- void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group);
+ /** Initialise the function's input and output.
+ *
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] args Matrix multiplication information.
+ * @param[in] memory_group Memory group to be used by the function.
+ */
+ void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group);
+
+ // Inherited methods overridden:
void run() override;
void prepare() override;
bool is_configured() const override;
@@ -187,9 +126,16 @@
};
template <typename TypeInput, typename TypeOutput>
-void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> &args, MemoryGroup &memory_group)
+void Fallback<TypeInput, TypeOutput>::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs<TypeOutput> args, MemoryGroup &memory_group)
{
- _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args, nullptr);
+ arm_gemm::GemmConfig gemm_cfg;
+ const arm_gemm::KernelDescription gemm_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args);
+ if(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+ {
+ gemm_cfg.filter = gemm_kernel_info.name;
+ args._cfg = &gemm_cfg;
+ }
+ _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput>(args);
if(_gemm_kernel_asm == nullptr)
{
//configuration not supported: Leave function unconfigured:
@@ -199,7 +145,7 @@
// arm_compute wrapper for the Gemm object (see above)
std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
- acl_gemm_wrapper->configure(_gemm_kernel_asm.get());
+ acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
const size_t workspace_size = _gemm_kernel_asm->get_working_size();
if(workspace_size > 0)
{
@@ -229,8 +175,6 @@
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
_pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
- _pretranspose.allocator()->allocate();
- ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
}
}
@@ -242,6 +186,7 @@
// Pretranspose B if required
if(_gemm_kernel_asm->B_pretranspose_required())
{
+ _pretranspose.allocator()->allocate();
ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
@@ -335,12 +280,8 @@
arm_gemm::GemmArgs<TypeOutput> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
//Try to create an ACL function:
- acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
- // If the type agnostic factory failed to create an ACL function, try the specialised one:
- if(acl_function == nullptr)
- {
- acl_function = create_function<TypeInput, TypeOutput>(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, memory_manager);
- }
+ acl_function = create_function_all_types(arm_gemm::get_gemm_method<TypeInput, TypeOutput>(args), a, b, d, alpha, beta, pretranspose_hint, std::move(memory_manager));
+
//If we still don't have an ACL function:
if(acl_function == nullptr)
{
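Note: the refactoring above keys the native-GEMM fast path on the kernel name reported by arm_gemm rather than on per-type factory specialisations. A standalone mimic of that selection rule (KernelDesc is a stand-in for arm_gemm::KernelDescription, not the real type):

#include <string>

// Stand-in for arm_gemm::KernelDescription: the dispatcher now receives the
// chosen kernel's method and name rather than just the method.
struct KernelDesc
{
    enum class Method { GEMM_INTERLEAVED, GEMM_NATIVE, OTHER };
    Method      method;
    std::string name;
};

// Mirrors the new rule above: the NEGEMMNativeWrapperKernel path is taken only
// when arm_gemm selected the sgemm_native_16x4 kernel; everything else falls
// back to the generic assembly Fallback.
bool use_native_acl_wrapper(const KernelDesc &kd)
{
    return kd.method == KernelDesc::Method::GEMM_NATIVE && kd.name.find("sgemm_native_16x4") != std::string::npos;
}

int main()
{
    const KernelDesc kd{ KernelDesc::Method::GEMM_NATIVE, "sgemm_native_16x4" };
    return use_native_acl_wrapper(kd) ? 0 : 1;
}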
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 0232a83..be7cc2d 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -90,7 +90,7 @@
}
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
_add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
_skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 4b02694..5286f11 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -97,9 +97,9 @@
else
{
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
+ TensorInfo info_a = a->info()->clone()->set_tensor_shape(compute_interleaved_shape(*a->info())).set_is_resizable(true);
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
+ TensorInfo info_b = b->info()->clone()->set_tensor_shape(compute_transpose1xW_shape(*b->info())).set_is_resizable(true);
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
_memory_group.manage(&_tmp_a);
@@ -241,8 +241,8 @@
shape_tmp_b.set(0, b->dimension(1) * 16);
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
- TensorInfo info_a(shape_tmp_a, 1, a->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+ TensorInfo info_a = a->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
+ TensorInfo info_b = b->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
new file mode 100644
index 0000000..078bd5a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGather.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEGatherKernel>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status NEGather::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+{
+ return NEGatherKernel::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
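Note: illustrative usage of the new NEGather function above; shapes, the axis value and the U32 index type are assumptions:

#include "arm_compute/runtime/NEON/functions/NEGather.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor input, indices, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    // Index tensor type is assumed to be U32 here; see NEGatherKernel for the supported types
    indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));
    // Gathering 3 entries along axis 0 gives a 3x4 output
    output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

    NEGather gather;
    gather.configure(&input, &indices, &output, 0);

    input.allocator()->allocate();
    indices.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input and indices ...
    gather.run();
    return 0;
}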
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index fa8aaeb..8645b43 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,5 +36,5 @@
auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
k->configure(input, output);
_kernel = std::move(k);
- _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
+ _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
}
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index d0b80fb..56da966 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -26,8 +26,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
@@ -57,8 +57,8 @@
ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
- // Reduce shape on axis (supported axis is 0)
- shape.set(0, 1);
+ // Reduce shape on axis
+ shape.set(axis, 1);
sum_sq.set_tensor_shape(shape);
ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
@@ -75,3 +75,4 @@
_memory_group.release();
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 7c7580a..9e7a713 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -111,8 +111,8 @@
_forget_gate_out2.allocator()->allocate();
_memory_group.manage(&_forget_gate_out5);
_accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+ _forget_gate_out1.allocator()->allocate();
Tensor *forget_gate_out = &_forget_gate_out5;
-
if(lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -129,18 +129,18 @@
{
_forget_gate_out3.allocator()->allocate();
}
- _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
- forget_gate_out->allocator()->allocate();
+ _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
// input_gate = 1 - forget_gate, with CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ Tensor *input_gate_out = &_input_gate_out1;
if(lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
- _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(&_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -162,16 +162,22 @@
_input_gate_out2.allocator()->allocate();
_memory_group.manage(&_input_gate_out4);
_accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+ _input_gate_out3.allocator()->allocate();
+ input_gate_out = &_input_gate_out4;
if(_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out5);
_pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _input_gate_out4.allocator()->allocate();
_input_gate_out5.allocator()->allocate();
+ input_gate_out = &_input_gate_out1;
}
- _input_gate_out3.allocator()->allocate();
- _input_gate_out4.allocator()->allocate();
- _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ else
+ {
+ _input_gate_out1.allocator()->allocate();
+ }
+ _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -194,11 +200,9 @@
_accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
_activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _input_gate_out1.allocator()->allocate();
+ _pixelwise_mul_cell_state1.configure(&_cell_state_out4, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_state_out4.allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _forget_gate_out1.allocator()->allocate();
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
@@ -246,7 +250,6 @@
_output1.allocator()->allocate();
}
_activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
- output_gate_out->allocator()->allocate();
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -265,6 +268,7 @@
_activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
_pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_state_activation.allocator()->allocate();
+ output_gate_out->allocator()->allocate();
if(lstm_params.has_projection())
{
@@ -281,19 +285,22 @@
// Copy cell state and output
_copy_cell_state.configure(&_cell_state_out1, cell_state_out);
- _cell_state_out1.allocator()->allocate();
_copy_output.configure(output_state_out, output);
// Vector for holding the tensors to store in scratch buffer
std::vector<ITensor *> scratch_inputs;
if(!lstm_params.has_cifg_opt())
{
- scratch_inputs.emplace_back(&_input_gate_out1);
+ scratch_inputs.emplace_back(input_gate_out);
}
scratch_inputs.emplace_back(&_cell_state_out1);
scratch_inputs.emplace_back(forget_gate_out);
scratch_inputs.emplace_back(output_gate_out);
_concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+ input_gate_out->allocator()->allocate();
+ _cell_state_out1.allocator()->allocate();
+ forget_gate_out->allocator()->allocate();
+ output_gate_out->allocator()->allocate();
}
Status NELSTMLayer::validate(const ITensorInfo *input,
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
new file mode 100644
index 0000000..f5c2718
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+TensorInfo get_expected_output_tensorinfo(const ITensorInfo &input, const PaddingList &paddings)
+{
+ const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input.tensor_shape(), paddings);
+ const TensorInfo expected_output_info = input.clone()->set_tensor_shape(expected_output_shape);
+ return expected_output_info;
+}
+
+Status validate_arguments(const ITensorInfo &input, ITensorInfo &output, const PaddingList &paddings)
+{
+ const TensorInfo expected_output_info = get_expected_output_tensorinfo(input, paddings);
+ auto_init_if_empty(output, expected_output_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output, &expected_output_info);
+
+ return Status{};
+}
+
+Coordinates get_subtensor_coords(const PaddingList &paddings)
+{
+ Coordinates coords;
+ for(unsigned int i = 0; i < paddings.size(); ++i)
+ {
+ coords.set(i, paddings[i].first);
+ }
+
+ return coords;
+}
+} // namespace
+
+NEPadLayer::NEPadLayer()
+ : _memset_kernel(), _copy_kernel(), _output_subtensor()
+{
+}
+
+void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, PixelValue constant_value)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_THROW_ON_ERROR(NEPadLayer::validate(input->info(), output->info(), padding, constant_value));
+
+ // Auto-init
+ auto_init_if_empty(*output->info(), get_expected_output_tensorinfo(*input->info(), padding));
+
+ // Create SubTensor (Can use sub-tensor as the kernels to be executed do not require padding)
+ _output_subtensor = SubTensor(output, input->info()->tensor_shape(), get_subtensor_coords(padding), true);
+
+ // Set the pages of the output to the specified value
+ _memset_kernel.configure(output, constant_value);
+
+ // Copy the input to the output
+ _copy_kernel.configure(input, &_output_subtensor);
+}
+
+Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value)
+{
+ ARM_COMPUTE_UNUSED(constant_value);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ auto output_clone = output->clone();
+
+ SubTensorInfo output_subtensor_info(output_clone.get(), input->tensor_shape(), get_subtensor_coords(padding), true);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output_clone, padding));
+ ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(input, &output_subtensor_info));
+
+ return Status{};
+}
+
+void NEPadLayer::run()
+{
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+}
+} // namespace arm_compute
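Note: illustrative usage of the new NEPadLayer above (pad sizes, shapes and the constant value are assumptions):

#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
    // Pad 1 element on each side of x and 2 on each side of y: 4x4 -> 6x8
    const PaddingList padding = { { 1, 1 }, { 2, 2 } };
    output.allocator()->init(TensorInfo(TensorShape(6U, 8U), 1, DataType::F32));

    NEPadLayer pad;
    pad.configure(&input, &output, padding, PixelValue(0.f));

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input ...
    pad.run();
    return 0;
}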
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 1f1400c..3aca4b7 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,14 @@
#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEROIPoolingLayer::NEROIPoolingLayer()
: _roi_kernel()
{
}
-void NEROIPoolingLayer::configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
{
_roi_kernel.configure(input, rois, output, pool_info);
}
@@ -43,3 +43,4 @@
{
NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
new file mode 100644
index 0000000..977d502
--- /dev/null
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERange.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERange::NERange()
+ : _kernel()
+{
+}
+
+void NERange::configure(ITensor *output, const float start, const float end, const float step)
+{
+ _kernel.configure(output, start, end, step);
+}
+
+Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step)
+{
+ return NERangeKernel::validate(output, start, end, step);
+}
+
+void NERange::run()
+{
+ NEScheduler::get().schedule(&_kernel, Window::DimX);
+}
+} // namespace arm_compute
\ No newline at end of file
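Note: illustrative usage of the new NERange function above; the output length must match the number of generated values (here 10 values for start 0, end 10, step 1):

#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    // 10 values: 0, 1, ..., 9
    Tensor output;
    output.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    NERange range;
    range.configure(&output, 0.f, 10.f, 1.f); // start, end (exclusive), step

    output.allocator()->allocate();
    range.run();
    return 0;
}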
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b022df..014895f 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -14,9 +14,9 @@
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNEUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY NEAIM, DAMAGES OR OTHER
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
@@ -39,17 +39,38 @@
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
- {
- if(output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
- }
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+ // Convert negative axis
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
}
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if(keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
return Status{};
}
@@ -62,22 +83,32 @@
_reduced_outs = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
// Perform reduction for every axis
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
- out_shape.set(reduction_axis[i], 1);
+ out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
if(i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
_memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], ReductionOperation::MEAN_SUM);
}
}
@@ -91,9 +122,13 @@
if(!keep_dims)
{
TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
for(unsigned int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(reduction_axis[i]);
+ out_shape.remove_dimension(axis_local[i] - i);
}
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
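Note: the rework above converts negative reduction axes with a wrap-around and sorts them before removing dimensions. A self-contained sketch of that normalisation (the helper below is a local re-implementation for illustration, not the library's own):

#include <algorithm>
#include <cassert>
#include <vector>

// Wrap a possibly negative axis into [0, num_dims)
int wrap_axis(int axis, int num_dims)
{
    return axis >= 0 ? axis % num_dims : (axis % num_dims + num_dims) % num_dims;
}

int main()
{
    const int input_dims = 4;            // e.g. a 4D activation tensor
    std::vector<int> axes = { -1, 1 };   // user-supplied reduction axes
    for(auto &a : axes)
    {
        a = wrap_axis(a, input_dims);    // -1 -> 3, 1 -> 1
    }
    // Sorting matters when keep_dims == false: after removing axis i,
    // every higher axis shifts down by one, hence "axis_local[i] - i" above.
    std::sort(axes.begin(), axes.end());
    assert(axes[0] == 1 && axes[1] == 3);
    return 0;
}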
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 188c2bb..9f81a40 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,16 +50,6 @@
ARM_COMPUTE_ERROR("Unsupported reduction axis");
}
}
-BorderMode reduction_operation_border_mode(ReductionOperation op)
-{
- switch(op)
- {
- case ReductionOperation::SUM_SQUARE:
- return BorderMode::CONSTANT;
- default:
- return BorderMode::CONSTANT;
- }
-}
} // namespace
NEReductionOperation::NEReductionOperation()
@@ -86,9 +76,9 @@
if(axis == 0)
{
// Configure fill border kernel
- BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
- BorderMode fill_border_mode = reduction_operation_border_mode(op);
- _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+ const BorderSize fill_border_size = _reduction_kernel.border_size();
+ const PixelValue pixelValue = (op == ReductionOperation::PROD) ? PixelValue(1, input->info()->data_type(), input->info()->quantization_info()) : PixelValue(0, input->info()->data_type());
+ _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
}
}
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
new file mode 100644
index 0000000..139bd50
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+
+#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEReverseKernel>();
+ k->configure(input, output, axis);
+ _kernel = std::move(k);
+}
+
+Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+{
+ return NEReverseKernel::validate(input, output, axis);
+}
+} // namespace arm_compute
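Note: illustrative usage of the new NEReverse function above; shapes and the U32 element type of the 1-D axis tensor are assumptions:

#include "arm_compute/runtime/NEON/functions/NEReverse.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor input, output, axis;
    input.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    // 1-D tensor holding the axes to flip
    axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32));

    NEReverse reverse;
    reverse.configure(&input, &output, &axis);

    input.allocator()->allocate();
    output.allocator()->allocate();
    axis.allocator()->allocate();

    // ... fill input, and write the axis index (e.g. 0) into the axis tensor ...
    reverse.run();
    return 0;
}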
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index a9c85bd..483aa4c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,6 +46,11 @@
{
ARM_COMPUTE_ERROR_ON(nullptr == offsets);
ARM_COMPUTE_UNUSED(sampling_policy);
+ float sampling_offset = 0.0f;
+ if(sampling_policy == SamplingPolicy::CENTER)
+ {
+ sampling_offset = 0.5f;
+ }
Window win;
win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -60,8 +65,8 @@
execute_window_loop(win, [&](const Coordinates & id)
{
- const float in_x = (id.x() + 0.5f) * wr - 0.5f;
- const float in_y = (id.y() + 0.5f) * hr - 0.5f;
+ const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
+ const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
const int in_xi = std::floor(in_x);
const int in_yi = std::floor(in_y);
@@ -167,14 +172,14 @@
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
}
Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+ ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
ITensorInfo *offsets = nullptr;
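Note: the change above generalises the coordinate mapping so that TOP_LEFT sampling uses a zero offset instead of the previously hard-coded 0.5 of CENTER. A tiny standalone sketch of that mapping:

#include <cstdio>

// Map an output coordinate back into the input image for a given scaling
// ratio; the offset is 0.5 for SamplingPolicy::CENTER and 0.0 for TOP_LEFT.
float map_coordinate(int out_coord, float ratio, float sampling_offset)
{
    return (out_coord + sampling_offset) * ratio - sampling_offset;
}

int main()
{
    const float wr = 2.0f; // input width / output width
    std::printf("CENTER:   %.2f\n", map_coordinate(3, wr, 0.5f)); // 6.50
    std::printf("TOP_LEFT: %.2f\n", map_coordinate(3, wr, 0.0f)); // 6.00
    return 0;
}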
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
new file mode 100644
index 0000000..509bbaa
--- /dev/null
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESelect.h"
+
+#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NESelectKernel>();
+ k->configure(c, x, y, output);
+ _kernel = std::move(k);
+}
+
+Status NESelect::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+{
+ return NESelectKernel::validate(c, x, y, output);
+}
+} // namespace arm_compute
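Note: illustrative usage of the new NESelect function above; shapes and the U8 condition type are assumptions:

#include "arm_compute/runtime/NEON/functions/NESelect.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor c, x, y, output;
    // Condition tensor: a non-zero element selects the value from x, otherwise from y
    c.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::U8));
    x.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    y.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

    NESelect select;
    select.configure(&c, &x, &y, &output);

    c.allocator()->allocate();
    x.allocator()->allocate();
    y.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill c, x and y ...
    select.run();
    return 0;
}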
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
new file mode 100644
index 0000000..03c2053
--- /dev/null
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESlice.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ // Get absolute end coordinates
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+ auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+ k->configure(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+ _kernel = std::move(k);
+}
+
+Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+
+ // Check start dimensions for being non-negative
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
+ {
+ return i < 0;
+ }));
+
+ // Get absolute end coordinates
+ const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
+
+ return NEStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
+}
+} // namespace arm_compute
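Note: illustrative usage of the new NESlice function above; shapes and the start/end coordinates are assumptions (starts must be non-negative, as validated in the code):

#include "arm_compute/runtime/NEON/functions/NESlice.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    // Slice [2,10) x [4,12) -> 8x8 output
    output.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

    NESlice slice;
    slice.configure(&input, &output, Coordinates(2, 4), Coordinates(10, 12));

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input ...
    slice.run();
    return 0;
}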
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 9be9e68..36b7d47 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,54 +25,155 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "utils/TypePrinter.h"
#include <cfloat>
-using namespace arm_compute;
-
-NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp()
+namespace arm_compute
{
+NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
+ _output_flattened(), _needs_flattening(false)
+{
+}
+
+void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis)
+{
+ // Flatten the input
+ const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+
+ // Initialize the flat input
+ _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+ // If the input needs to be flattened, we can use either NEFlattenLayerKernel or NEReshapeLayerKernel:
+ // when flattening on the third axis, we use NEFlattenLayerKernel;
+ // in all other cases we have to use NEReshapeLayerKernel.
+ if(axis != 3)
+ {
+ auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
+ reshape_kernel_ptr->configure(input, &_input_flattened);
+ _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr);
+ }
+ else
+ {
+ auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
+ flatten_kernel_ptr->configure(input, &_input_flattened);
+ _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr);
+ }
+
+ // The output tensor needs to be initialized here, as the reshape kernel
+ // expects both tensors to be already initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
}
void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
{
+ // Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_UNUSED(axis);
+ ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis));
- // Configure Kernels
- _max_kernel.configure(input, &_max);
- _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
- _softmax_kernel.configure(input, &_max, output, beta, &_tmp);
+ // Flattening is not needed only when the input is 2D and the softmax is performed along axis 1
+ _needs_flattening = axis != 1;
+
+ // If we are dealing with a 4D tensor, we will:
+ // - Flatten the input, so that we end up with a [width*height*depth] x batches 2D tensor
+ // - Execute the whole pipeline (reduction + normalization) on the flattened tensor
+ // - Reshape the flattened output back into the real output
+ if(_needs_flattening)
+ {
+ // Add _input_flattened to the memory manager
+ _memory_group.manage(&_input_flattened);
+
+ // Configure _flat_or_reshape_kernel_ptr and _input_flattened
+ configure_reshape_input_kernel(input, output, axis);
+ }
+
+ // We want to deal with a 2D input: either the flattened version of the original input (4D case)
+ // or the original input itself (2D case)
+ ITensor *input_2D = (_needs_flattening ? &_input_flattened : input);
+
+ // Create intermediate tensors shapes
+ const TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::F32 : input_2D->info()->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
// Init intermediate tensors
- _max.allocator()->init(*_max.info());
- _tmp.allocator()->init(*_tmp.info());
+ TensorShape max_sum_shape = input_2D->info()->tensor_shape();
+ max_sum_shape.set(0, 1);
+ _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
+ _tmp.allocator()->init(tensor_info_tmp);
// Manage intermediate buffers
_memory_group.manage(&_max);
_memory_group.manage(&_tmp);
- // Allocate intermediate tensors
+ // Configure Kernels
+ _max_kernel.configure(input_2D, &_max);
+ if(_needs_flattening)
+ {
+ // Add _output_flattened to the memory manager
+ _memory_group.manage(&_output_flattened);
+
+ // The normalization kernel stores the result in a flat output tensor
+ _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp);
+ _input_flattened.allocator()->allocate();
+
+ // Reshape the flat output into the requested (4D) output
+ _reshape_kernel.configure(&_output_flattened, output);
+
+ // Allocate the intermediate flat tensors
+ _output_flattened.allocator()->allocate();
+ }
+ else
+ {
+ // Softmax 2D case
+ _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE);
+ _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp);
+ }
+
+ // Allocate intermediate buffers
_max.allocator()->allocate();
_tmp.allocator()->allocate();
}
Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON");
-
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < 1 || input->num_dimensions() < axis);
- const TensorShape max_shape = TensorShape(input->tensor_shape()).set(0, 1);
- const TensorInfo tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
- const TensorInfo dont_care;
+ // Create intermediate tensor info
+ DataType tmp_data_type = input->data_type();
+ const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
+
+ TensorShape max_sum_shape = input->tensor_shape();
+ max_sum_shape.set(0, 1);
+ const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
+ const TensorInfo dont_care;
+
+ const bool needs_flattening = (axis != 1);
+
+ if(needs_flattening)
+ {
+ const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+ TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+ if(axis != 3)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat));
+ }
+ }
ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care));
return Status{};
}
@@ -81,9 +182,20 @@
{
_memory_group.acquire();
+ if(_needs_flattening)
+ {
+ NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY);
+ }
+
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
NEScheduler::get().schedule(&_max_kernel, Window::DimY);
NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
+ if(_needs_flattening)
+ {
+ NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+ }
+
_memory_group.release();
}
+} // namespace arm_compute
\ No newline at end of file
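With this change NESoftmaxLayer accepts inputs of up to 4 dimensions: for axis != 1 the input is flattened (reshape or flatten kernel), the max and softmax kernels run on the 2D view, and the flat result is reshaped back into the original layout. A minimal usage sketch for a 4D tensor (not part of the patch); shapes are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void nesoftmax_4d_example()
{
    Tensor input, output;
    const TensorInfo info(TensorShape(10U, 7U, 3U, 2U), 1, DataType::F32);
    input.allocator()->init(info);
    output.allocator()->init(info); // softmax preserves the input shape

    NESoftmaxLayer softmax;
    // axis = 3: the first three dimensions are flattened internally, so the
    // normalization runs over width * height * depth for each batch.
    softmax.configure(&input, &output, /* beta */ 1.0f, /* axis */ 3);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill input ...
    softmax.run();
}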
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
new file mode 100644
index 0000000..e947657
--- /dev/null
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESplit.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NESplit::NESplit()
+ : _outputs_vector(), _slice_functions(), _num_outputs(0)
+{
+}
+
+void NESplit::configure(const ITensor *input, const std::vector<ITensor *> &outputs, unsigned int axis)
+{
+ // Create Slice functions
+ _num_outputs = outputs.size();
+ _slice_functions = arm_compute::support::cpp14::make_unique<NESlice[]>(_num_outputs);
+
+ // Get output shape
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
+
+ // Extract output tensor info
+ std::vector<ITensorInfo *> outputs_info;
+ for(auto &output : outputs)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ outputs_info.emplace_back(output->info());
+ }
+
+ // Validate
+ ARM_COMPUTE_ERROR_THROW_ON(NESplit::validate(input->info(), outputs_info, axis));
+
+ const size_t axis_split_step = output_shape[axis];
+ unsigned int axis_offset = 0;
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+
+ for(unsigned int i = 0; i < _num_outputs; i++)
+ {
+ // Update coordinate on axis
+ start_coords.set(axis, axis_offset);
+ end_coords.set(axis, axis_offset + axis_split_step);
+
+ // Configure slice function
+ _slice_functions[i].configure(input, outputs[i], start_coords, end_coords);
+
+ // Set valid region from shape
+ outputs[i]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+
+ // Update axis offset
+ axis_offset += axis_split_step;
+ }
+}
+
+Status NESplit::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, unsigned int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+ // Get output shape
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input, axis, outputs.size());
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+ const size_t axis_split_step = output_shape[axis];
+ unsigned int axis_offset = 0;
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+
+ // Validate output tensors
+ for(const auto &output : outputs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Auto-initialize the output if it has not been initialized yet
+ TensorInfo tmp_output_info = *output->clone();
+ auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+ // Update coordinate on axis
+ start_coords.set(axis, axis_offset);
+ end_coords.set(axis, axis_offset + axis_split_step);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(input, output, start_coords, end_coords));
+ axis_offset += axis_split_step;
+ }
+
+ return Status{};
+}
+
+void NESplit::run()
+{
+ for(unsigned i = 0; i < _num_outputs; ++i)
+ {
+ _slice_functions[i].run();
+ }
+}
+} // namespace arm_compute
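NESplit divides the input evenly along the chosen axis and configures one NESlice per output. A minimal usage sketch (not part of the patch) splitting an 8x6 tensor into two 4x6 halves along axis 0; the output infos are assumed to be auto-initialized by the underlying slice kernels:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NESplit.h"
#include "arm_compute/runtime/Tensor.h"

#include <vector>

using namespace arm_compute;

void nesplit_example()
{
    Tensor input, out0, out1;
    input.allocator()->init(TensorInfo(TensorShape(8U, 6U), 1, DataType::F32));

    std::vector<ITensor *> outputs = { &out0, &out1 };

    NESplit split;
    split.configure(&input, outputs, /* axis */ 0); // each output becomes 4 x 6

    input.allocator()->allocate();
    out0.allocator()->allocate(); // output infos assumed to be set by the slice kernels
    out1.allocator()->allocate();
    // ... fill input ...
    split.run();
}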
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
new file mode 100644
index 0000000..2f49c22
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "support/ToolchainSupport.h"
+namespace arm_compute
+{
+NEStackLayer::NEStackLayer() // NOLINT
+ : _input(),
+ _stack_kernels(),
+ _num_inputs(0)
+{
+}
+
+void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
+{
+ _num_inputs = input.size();
+ _stack_kernels = arm_compute::support::cpp14::make_unique<NEStackLayerKernel[]>(_num_inputs);
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+ }
+}
+
+Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input.empty());
+
+ // Wrap around negative values
+ const size_t rank = input[0]->num_dimensions();
+ const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
+
+ const unsigned int num_inputs = input.size();
+
+ for(unsigned int i = 0; i < num_inputs; i++)
+ {
+ // All the tensors must have the same rank
+ ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
+ // Validate Kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
+ }
+
+ return Status{};
+}
+
+void NEStackLayer::run()
+{
+ for(unsigned i = 0; i < _num_inputs; i++)
+ {
+ NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY);
+ }
+}
+} // namespace arm_compute
\ No newline at end of file
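NEStackLayer packs N tensors of identical shape into an output of rank+1, inserting a new dimension of size N at the (possibly negative) axis, which is wrapped against rank+1. A minimal usage sketch (not part of the patch); shapes are illustrative and the output info is assumed to be auto-initialized by the stack kernels:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
#include "arm_compute/runtime/Tensor.h"

#include <vector>

using namespace arm_compute;

void nestack_example()
{
    Tensor in0, in1, output;
    in0.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));

    std::vector<ITensor *> inputs = { &in0, &in1 };

    NEStackLayer stack;
    // axis = -1 wraps to rank (2), appending a new dimension of size 2.
    stack.configure(inputs, /* axis */ -1, &output);

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    output.allocator()->allocate(); // output info assumed to be set by the stack kernels
    // ... fill in0 and in1 ...
    stack.run();
}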
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
new file mode 100644
index 0000000..53eb2b0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
+
+#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEStridedSlice::configure(const ITensor *input, ITensor *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEStridedSliceKernel>();
+ k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ _kernel = std::move(k);
+}
+
+Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+ int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+ return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+} // namespace arm_compute
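NEStridedSlice exposes the full strided-slice semantics: per-dimension starts, ends and strides plus begin/end/shrink-axis bit masks (bit i set ignores the given start/end for dimension i, or removes dimension i, respectively). A minimal usage sketch (not part of the patch) taking every second element along x; shapes and values are illustrative, and the output info is assumed to be auto-initialized by the kernel:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void nestridedslice_example()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

    const Coordinates starts(0, 0);
    const Coordinates ends(16, 8);
    const BiStrides   strides(2, 1); // every second element along x, every element along y

    NEStridedSlice strided_slice;
    strided_slice.configure(&input, &output, starts, ends, strides,
                            /* begin_mask */ 0, /* end_mask */ 0, /* shrink_axis_mask */ 0);

    input.allocator()->allocate();
    output.allocator()->allocate(); // output info assumed to be set by configure()
    // ... fill input ...
    strided_slice.run();
}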
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
new file mode 100644
index 0000000..0ca4413
--- /dev/null
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETile.h"
+
+#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NETileKernel>();
+ k->configure(input, output, multiples);
+ _kernel = std::move(k);
+}
+
+Status NETile::validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples)
+{
+ return NETileKernel::validate(input, output, multiples);
+}
+} // namespace arm_compute
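NETile repeats the input along each dimension according to the Multiples vector (one multiplier per dimension). A minimal usage sketch (not part of the patch); shapes and multipliers are illustrative, and the output info is assumed to be auto-initialized by the tile kernel:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NETile.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void netile_example()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));

    NETile tile;
    tile.configure(&input, &output, Multiples{ 2, 3 }); // output becomes 8 x 9

    input.allocator()->allocate();
    output.allocator()->allocate(); // output info assumed to be set by the tile kernel
    // ... fill input ...
    tile.run();
}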
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
new file mode 100644
index 0000000..7532020
--- /dev/null
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
+{
+ return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
+}
+
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+{
+ // Sets up the coordinates to slice the input tensor: the start coordinates are all 0s and the unstacking axis of both start/end is updated to slice just one 2D tensor at a time.
+ Coordinates slice_end;
+ slice_start.set_num_dimensions(input_num_dimensions);
+ slice_end.set_num_dimensions(input_num_dimensions);
+ for(size_t k = 0; k < input_num_dimensions; ++k)
+ {
+ slice_start.set(k, 0);
+ slice_end.set(k, -1);
+ }
+ slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(slice_end);
+}
+} // namespace
+
+NEUnstack::NEUnstack() // NOLINT
+ : _num_slices(0),
+ _strided_slice_vector()
+{
+}
+
+void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
+{
+ std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
+
+ // Wrap around negative values
+ const unsigned int axis_u = wrap_axis(axis, input->info());
+ _num_slices = std::min(outputs_vector_info.size(), input->info()->dimension(axis_u));
+ _strided_slice_vector = arm_compute::support::cpp14::make_unique<NEStridedSlice[]>(_num_slices);
+
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
+ for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ {
+ // Adjust the start coordinate on the unstacking axis to take one 2D slice at a time
+ slice_start.set(axis_u, slice);
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ }
+}
+
+Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &output_vector, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_vector.empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < (-static_cast<int>(input->tensor_shape().num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis >= static_cast<int>(input->tensor_shape().num_dimensions()));
+
+ const unsigned int num_slices = std::min(output_vector.size(), input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > input->dimension(wrap_axis(axis, input)));
+ ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
+
+ Coordinates slice_start;
+ int32_t slice_end_mask;
+ for(size_t k = 0; k < num_slices; ++k)
+ {
+ slice_start.set(wrap_axis(axis, input), k);
+ setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ }
+ return Status{};
+}
+
+void NEUnstack::run()
+{
+ for(unsigned i = 0; i < _num_slices; ++i)
+ {
+ _strided_slice_vector[i].run();
+ }
+}
+} // namespace arm_compute
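NEUnstack produces one rank-(R-1) output per slice along the unstacking axis by configuring one NEStridedSlice per slice with a shrink-axis mask of (1 << axis). A minimal usage sketch (not part of the patch) unstacking a 4x3x2 tensor along its last axis; shapes are illustrative and the output infos are assumed to be auto-initialized during configure():

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
#include "arm_compute/runtime/Tensor.h"

#include <vector>

using namespace arm_compute;

void neunstack_example()
{
    Tensor input, out0, out1;
    input.allocator()->init(TensorInfo(TensorShape(4U, 3U, 2U), 1, DataType::F32));

    std::vector<ITensor *> outputs = { &out0, &out1 };

    NEUnstack unstack;
    unstack.configure(&input, outputs, /* axis */ 2); // one 4 x 3 output per slice

    input.allocator()->allocate();
    out0.allocator()->allocate(); // output infos assumed to be set by the slice kernels
    out1.allocator()->allocate();
    // ... fill input ...
    unstack.run();
}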
diff --git a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
index 097605c..7e435c3 100644
--- a/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEWidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,7 @@
_num_inputs = inputs_vector.size();
std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for(unsigned int i = 0; i < _num_inputs; ++i)
{
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
}
@@ -80,7 +80,7 @@
_concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEWidthConcatenateLayerKernel[]>(_num_inputs);
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for(unsigned int i = 0; i < _num_inputs; ++i)
{
_concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
width_offset += inputs_vector.at(i)->info()->dimension(0);
@@ -89,7 +89,7 @@
void NEWidthConcatenateLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ for(unsigned i = 0; i < _num_inputs; ++i)
{
NEScheduler::get().schedule(_concat_kernels_vector.get() + i, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index c8e3b3b..e37f8ab 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -464,6 +464,7 @@
transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
//The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+ _memory_group.manage(&_output_nhwc);
transform_output_kernel->configure(biases, &_output_workspace,
output_matrix_stride, &_output_nhwc,
in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
@@ -483,16 +484,16 @@
in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
}
- _weights_hwio.allocator()->allocate();
_gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
_input_workspace.allocator()->allocate();
- _kernel_storage.allocator()->allocate();
_output_workspace.allocator()->allocate();
// Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
- _output_nhwc.allocator()->allocate();
+ if(data_layout == DataLayout::NCHW)
+ {
+ _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+ _output_nhwc.allocator()->allocate();
+ }
_transform_input_kernel = std::move(transform_input_kernel);
_transform_weights_kernel = std::move(transform_weights_kernel);
@@ -656,10 +657,12 @@
if(!_is_prepared)
{
// Permute weights
+ _weights_hwio.allocator()->allocate();
_permute_weights.run();
_weights->mark_as_unused();
// Transform weights
+ _kernel_storage.allocator()->allocate();
NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
_weights_hwio.allocator()->free();
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index c87e82a..34aaea0 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,18 +26,159 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+
namespace arm_compute
{
+#ifndef NO_MULTI_THREADING
+class BufferManagerMultipleThreads final : public IBufferManager
+{
+public:
+ /** Number of buffers to ping pong between */
+ static constexpr unsigned int NUM_BUFFERS = 3;
+
+ explicit BufferManagerMultipleThreads(unsigned int max_num_users)
+ : _max_num_users(max_num_users)
+ {
+ }
+ unsigned int num_buffers() const override
+ {
+ return NUM_BUFFERS;
+ }
+ /* - Lock the requested index if it's free and return true if it needs reshaping.
+ * - Return false without acquiring the lock if the buffer at the index is already reshaped / being reshaped.
+ * - Block if the corresponding buffer for the given index is still being used by a different index.
+ */
+ bool lock_to_reshape_if_needed(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ while(true)
+ {
+ if(buf.index == index && buf.state != State::FREE)
+ {
+ //Another thread is already reshaping / has already reshaped this block: nothing to do
+ return false;
+ }
+ else
+ {
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ //If the buffer is free then lock it for reshaping:
+ if(buf.state == State::FREE)
+ {
+ buf.index = index;
+ buf.state = State::BEING_RESHAPED;
+ return true;
+ }
+ // Check again just in case it changed while we were acquiring the lock:
+ if(buf.index == index)
+ {
+ //Another thread is reshaping this block already, nothing to do
+ return false;
+ }
+ // buf.index != index: Buffer still being used by another block, need to wait
+ buf.sem.wait(lock);
+ }
+ }
+ }
+ /* Mark the buffer at the given index as reshaped and release the lock acquired via lock_to_reshape_if_needed() */
+ void mark_as_reshaped(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ {
+ std::lock_guard<std::mutex> lock(buf.mutex);
+ buf.users = _max_num_users;
+ buf.state = State::IN_USE;
+ }
+ buf.sem.notify_all();
+ }
+
+ /* Block until the buffer at the given index is reshaped */
+ void wait_for_reshaping(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+ // Check if it's already ready to use:
+ if(buf.state == State::IN_USE)
+ return;
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ //Double check it didn't change while we were acquiring the lock:
+ if(buf.state == State::IN_USE)
+ return;
+ buf.sem.wait(lock);
+ }
+ /* Mark the buffer at the given index as not used by this thread anymore.
+ * Once all the threads have called this method then the buffer is marked as free again.
+ */
+ void mark_as_unused(unsigned int index) override
+ {
+ Buffer &buf = get_buffer_from_index(index);
+ ARM_COMPUTE_ERROR_ON(buf.index != index); // Should have blocked in lock_to_reshape_if_needed()
+ if(--buf.users == 0)
+ {
+ std::unique_lock<std::mutex> lock(buf.mutex);
+ buf.state = State::FREE;
+ lock.unlock();
+ buf.sem.notify_all();
+ }
+ }
+
+private:
+ enum class State
+ {
+ FREE,
+ BEING_RESHAPED,
+ IN_USE
+ };
+ struct Buffer
+ {
+ unsigned int index{};
+ std::atomic_uint users{};
+ State state{ State::FREE };
+ std::mutex mutex{};
+ std::condition_variable sem{};
+ } _buffers[NUM_BUFFERS];
+ Buffer &get_buffer_from_index(unsigned int index)
+ {
+ return _buffers[index % NUM_BUFFERS];
+ }
+ unsigned int _max_num_users;
+};
+#endif /* NO_MULTI_THREADING */
+
+class BufferManagerSingleThread : public IBufferManager
+{
+public:
+ unsigned int num_buffers() const override
+ {
+ return 1;
+ }
+ bool lock_to_reshape_if_needed(unsigned int index) override
+ {
+ return true;
+ }
+ void mark_as_reshaped(unsigned int index) override
+ {
+ }
+ void wait_for_reshaping(unsigned int index) override
+ {
+ }
+ void mark_as_unused(unsigned int index) override
+ {
+ }
+};
+
NEGEMMInterleavedWrapper::NEGEMMInterleavedWrapper(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager))
{
}
+
void NEGEMMInterleavedWrapper::run()
{
prepare();
@@ -53,6 +194,7 @@
{
if(_pretranspose_b)
{
+ _transformed_b.allocator()->allocate();
NEScheduler::get().schedule(_prepare_b.get(), Window::DimX);
_b->mark_as_unused();
}
@@ -65,12 +207,13 @@
//Maximum number of workloads to create:
const unsigned int num_threads = NEScheduler::get().num_threads();
- const unsigned int max_iterations = num_threads == 1 ? 1 : num_threads;
+ const unsigned int max_iterations = std::max(num_threads, _num_windows);
//Maximum number of iterations the parameters allow:
const unsigned int num_iterations = _batch_window.num_iterations_total();
// Keep the smallest of the two:
const unsigned int num_windows = std::min(num_iterations, max_iterations);
const TensorShape window_shape = _batch_window.shape();
+ const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
// Create a 1D window to dynamically split the batch window:
Window win_1D;
@@ -79,66 +222,119 @@
// Create one workload for each sub-window:
for(unsigned int w = 0; w < num_windows; w++)
{
- Window win = win_1D.split_window(0, w, num_windows);
- const Coordinates start_offset = index2coords(window_shape, win.x().start());
- const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
- const unsigned int num_x_blocks = _block_walker.num_iterations(Window::DimX);
+ Window win = win_1D.split_window(0, w, num_windows);
+ const Coordinates start_offset = index2coords(window_shape, win.x().start());
+ const Coordinates end_offset = index2coords(window_shape, win.x().end() - 1);
- auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+ if(_pretranspose_b)
{
- //For each block of rows in "M"
- auto workload_mm = this->_mm_workloads.begin();
- for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+ auto workload = [start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
{
- // Transform one k_block from A:
- this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
- // Then perform the matrix multiplication for each x block along N:
- for(unsigned int i = 0; i < num_x_blocks; i++)
+ //For each block of rows in "M"
+ auto workload_mm = this->_mm_workloads.begin();
+ for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
{
- ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
- this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ // Transform one k_block from A:
+ this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+ // Then perform the matrix multiplication for each x block along N:
+ for(unsigned int i = 0; i < num_x_blocks; i++)
+ {
+ ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+ this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ }
}
- }
- };
- _workloads.push_back(workload);
+ };
+ _workloads.push_back(workload);
+ }
+ else
+ {
+ auto workload = [num_threads, start_offset, end_offset, num_x_blocks, this](const ThreadInfo & info)
+ {
+ //For each block of rows in "M"
+ auto workload_mm = this->_mm_workloads.begin();
+ unsigned int workload_b = 0;
+ //If there is only one thread, reshape the B blocks only as they are needed:
+ unsigned int workload_b_next = num_threads == 1 ? this->_b_workloads.size() : 1;
+
+ for(auto workload_a = this->_a_workloads.begin(); workload_a != this->_a_workloads.end(); workload_a++)
+ {
+ // Transform one k_block from A:
+ this->_transform_a->transform(*workload_a, info, this->_batch_window, start_offset, end_offset);
+ // Then perform the matrix multiplication for each x block along N:
+ for(unsigned int i = 0; i < num_x_blocks; i++)
+ {
+ ARM_COMPUTE_ERROR_ON(workload_mm == this->_mm_workloads.end());
+ if(workload_b_next < this->_b_workloads.size())
+ {
+ //Lock on BufferManager: does the reshape need to run?
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b_next);
+ }
+ workload_b_next++;
+ }
+ ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+ // Run if needed or wait
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b);
+ }
+ this->_buffer_manager->wait_for_reshaping(workload_b);
+ this->_matrix_multiply->transform(*workload_mm++, info, this->_batch_window, start_offset, end_offset);
+ this->_buffer_manager->mark_as_unused(workload_b);
+ workload_b++;
+ }
+ }
+ };
+ _workloads.push_back(workload);
+ }
+ }
+ if(!_pretranspose_b && num_windows > 1 && num_windows % num_threads != 0)
+ {
+ //Make sure the number of workloads is a multiple of the number of threads to avoid deadlocks:
+ for(unsigned int leftover = num_windows % num_threads; leftover != num_threads; leftover++)
+ {
+ auto workload = [this](const ThreadInfo & info)
+ {
+ unsigned int workload_b = 0;
+ //If there is only one thread, reshape the B blocks only as they are needed:
+ unsigned int workload_b_next = 1;
+
+ for(unsigned int iteration = 0; iteration < this->_mm_workloads.size(); iteration++)
+ {
+ if(workload_b_next < this->_b_workloads.size())
+ {
+ //Lock on BufferManager: does the reshape need to run?
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b_next))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b_next], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b_next);
+ }
+ workload_b_next++;
+ }
+ ARM_COMPUTE_ERROR_ON(workload_b >= this->_b_workloads.size());
+ // Run if needed or wait
+ if(this->_buffer_manager->lock_to_reshape_if_needed(workload_b))
+ {
+ this->_prepare_b->transform(this->_b_workloads[workload_b], info);
+ this->_buffer_manager->mark_as_reshaped(workload_b);
+ }
+ this->_buffer_manager->wait_for_reshaping(workload_b);
+ this->_buffer_manager->mark_as_unused(workload_b);
+ workload_b++;
+ }
+ };
+ _workloads.push_back(workload);
+ }
}
_is_prepared = true;
}
}
-namespace
-{
-// Factory to instantiate NEGEMMInterleavedPrepareBWrapperKernel:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor *b, ITensor *transformed_b, const INEGEMMWrapperKernel::Params ¶ms)
-{
- auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<InputType, use_dot>>();
- prepare_b->configure(b, transformed_b, false, NEScheduler::get().cpu_info(), params);
- return std::move(prepare_b);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor *a, ITensor *transformed_a, const Window &block_walker, const INEGEMMWrapperKernel::Params ¶ms)
-{
- auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<InputType, use_dot>>();
- transform_a->configure(a, transformed_a, false, block_walker, params);
- return std::move(transform_a);
-}
-
-// Factory to instantiate NEGEMMInterleavedTransformAWrapperTemplate:
-template <typename InputType, typename OutputType, bool use_dot = false>
-std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
- const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params ¶ms, bool pretranspose_b, float alpha, float beta)
-{
- auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<InputType, OutputType, use_dot>>();
- matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, pretranspose_b, alpha, beta, NEScheduler::get().num_threads());
- return std::move(matrix_multiply);
-}
-} // namespace
-
-void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b, bool use_dot)
+void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, bool pretranspose_b)
{
_params = INEGEMMWrapperKernel::extract_parameters(a, b, c);
_a = a;
@@ -146,124 +342,80 @@
_c = c;
_pretranspose_b = pretranspose_b;
- DataType input_type = a->info()->data_type();
+ const DataType input_type = a->info()->data_type();
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ const arm_gemm::KernelDescription gemm_kernel_info = get_gemm_info(input_type, ci, num_threads, _params, alpha, beta, pretranspose_b);
+ ARM_COMPUTE_ERROR_ON(gemm_kernel_info.method != arm_gemm::GemmMethod::GEMM_INTERLEAVED);
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
_transformed_b.allocator()->init(TensorInfo{}, alignment);
_tmp_c.allocator()->init(TensorInfo{}, alignment);
- _tag = "NEGEMMInterleaved_";
- _tag += get_strategy_name(input_type, use_dot);
+ _tag = "NEGEMMInterleaved_" + gemm_kernel_info.name;
+
+ // Get strategy
+ std::unique_ptr<detail::IInterleavedStrategy> strategy = detail::create_strategy(gemm_kernel_info.name);
+ _num_windows = iceildiv(_params.M, strategy->out_height()) * _params.batches;
+ ARM_COMPUTE_ERROR_ON(strategy == nullptr);
if(!_pretranspose_b)
{
+ _block_sizes = strategy->calculate_block_sizes_for_strategy(ci, _params);
+ _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+ _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
+ // If the execution is single-threaded or has only one window, the buffer manager needs only 1 buffer; otherwise we use NUM_BUFFERS buffers and ping-pong between them:
+ const unsigned int num_iterations = _batch_window.num_iterations_total();
+ if(NEScheduler::get().num_threads() == 1 || num_iterations == 1)
+ {
+ _buffer_manager = support::cpp14::make_unique<BufferManagerSingleThread>();
+ }
+ else
+ {
+#ifdef NO_MULTI_THREADING
+ ARM_COMPUTE_ERROR("Can't have more than 1 buffer without multiple threads");
+#else /* NO_MULTI_THREADING */
+ _buffer_manager = support::cpp14::make_unique<BufferManagerMultipleThreads>(NEScheduler::get().num_threads());
+#endif /* NO_MULTI_THREADING */
+ }
// If B is transposed at every iteration then transformed_B can be managed:
_memory_group.manage(&_transformed_b);
- _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
+ auto_init_if_empty(*_transformed_b.info(), _b->info()->clone()->set_tensor_shape(TensorShape(_block_sizes.x_block * _block_sizes.k_block, _buffer_manager->num_buffers())));
}
else
{
_tag += "_preB";
- switch(input_type)
- {
- case DataType::F32:
- _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
- }
- else
- {
- _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
- }
- break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
- ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+ }
+ _prepare_b = strategy->instantiate_prepareB(b, &_transformed_b, _params, ci);
+ ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+
+ if(_pretranspose_b)
+ {
_block_sizes = _prepare_b->block_sizes();
+ _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
+ _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
}
_block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
_block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
_block_walker.set(Window::DimZ, Window::Dimension(0, _params.multis));
- _batch_window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_block_sizes.m_round, _block_sizes.strategy_out_height), _block_sizes.strategy_out_height));
- _batch_window.set(Window::DimY, Window::Dimension(0, _params.batches));
-
_transformed_a.allocator()->init(TensorInfo(TensorShape{ _block_sizes.k_block, _block_sizes.m_round, _params.batches }, 1, input_type), alignment);
_memory_group.manage(&_transformed_a);
_memory_group.manage(&_tmp_c);
- switch(input_type)
- {
- case DataType::F32:
- _transform_a = instantiate_transformA<float>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<float, float>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- break;
-#ifdef __aarch64__
- case DataType::U8:
- case DataType::QASYMM8:
- if(use_dot)
- {
- _transform_a = instantiate_transformA<uint8_t, true>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- else
- {
- _transform_a = instantiate_transformA<uint8_t, false>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<uint8_t, uint32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- break;
- case DataType::S8:
- if(use_dot)
- {
- _transform_a = instantiate_transformA<int8_t, true>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, true>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- else
- {
- _transform_a = instantiate_transformA<int8_t, false>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<int8_t, int32_t, false>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- }
- break;
-#endif /* __aarch64__ */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _transform_a = instantiate_transformA<__fp16>(_a, &_transformed_a, _block_walker, _params);
- _matrix_multiply = instantiate_matrix_multiply<__fp16, __fp16>(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, pretranspose_b, alpha, beta);
- break;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- break;
- }
+ _transform_a = strategy->instantiate_transformA(_a, &_transformed_a, _block_walker, _params);
+ _matrix_multiply = strategy->instantiate_matrix_multiply(&_transformed_a, &_transformed_b, &_tmp_c, c, _block_walker, _block_sizes, _params, alpha, beta, pretranspose_b, num_threads);
ARM_COMPUTE_ERROR_ON(_transform_a == nullptr);
ARM_COMPUTE_ERROR_ON(_matrix_multiply == nullptr);
+
_transformed_a.allocator()->allocate();
_tmp_c.allocator()->allocate();
- _transformed_b.allocator()->allocate();
+ if(!_pretranspose_b)
+ {
+ _transformed_b.allocator()->allocate();
+ }
}
} // namespace arm_compute
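The multi-threaded buffer manager lets any worker volunteer to reshape the next B block while the others either help with a different block or wait, ping-ponging between NUM_BUFFERS slots. The per-block protocol each workload follows boils down to the sketch below (schematic only, not part of the patch; BufferManager stands for any implementation of the IBufferManager interface above, and reshape_b / use_b are placeholder callables):

#include <functional>

// Schematic of the per-block protocol followed by every workload lambda.
template <typename BufferManager>
void process_b_block(BufferManager &mgr, unsigned int b,
                     const std::function<void(unsigned int)> &reshape_b,
                     const std::function<void(unsigned int)> &use_b)
{
    // Volunteer to reshape block b if nobody has claimed its buffer slot yet.
    if(mgr.lock_to_reshape_if_needed(b))
    {
        reshape_b(b);            // fill buffer slot b % num_buffers()
        mgr.mark_as_reshaped(b); // wake up any thread waiting on this block
    }
    // Whether we reshaped it ourselves or not, wait until the block is ready,
    mgr.wait_for_reshaping(b);
    use_b(b);                    // e.g. run the matrix multiply against the reshaped block
    // then drop our reference; the last user marks the slot FREE so it can be reused.
    mgr.mark_as_unused(b);
}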
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index d0b3bde..ad23220 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,16 @@
#include <map>
#include <vector>
-using namespace arm_compute;
-
+namespace arm_compute
+{
+namespace
+{
+size_t align_offset(size_t offset, size_t alignment)
+{
+ const size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
+ return (remainder != 0U) ? offset + (alignment - remainder) : offset;
+}
+} // namespace
OffsetLifetimeManager::OffsetLifetimeManager()
: _blob(0)
{
@@ -58,11 +66,15 @@
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
// Update blob size
- size_t max_group_size = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), static_cast<size_t>(0), [](size_t s, const Blob & b)
+ size_t max_aggregated_size = 0;
+ std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b)
{
- return s + b.max_size;
+ max_aggregated_size += b.max_size;
+ _blob.alignment = std::max(_blob.alignment, b.max_alignment);
});
- _blob = std::max(_blob, max_group_size);
+ max_aggregated_size += _free_blobs.size() * _blob.alignment;
+ _blob.owners = std::max(_blob.owners, _free_blobs.size());
+ _blob.size = std::max(_blob.size, max_aggregated_size);
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
@@ -76,6 +88,8 @@
group_mappings[bound_element.handle] = offset;
}
offset += free_blob.max_size;
- ARM_COMPUTE_ERROR_ON(offset > _blob);
+ offset = align_offset(offset, _blob.alignment);
+ ARM_COMPUTE_ERROR_ON(offset > _blob.size);
}
}
+} // namespace arm_compute
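The new align_offset helper rounds an offset up to the next multiple of the requested alignment (leaving it unchanged when the alignment is 0 or the offset is already aligned), which is also why the aggregated blob size is padded by one alignment per free blob. A self-contained illustration of the arithmetic (the helper itself is file-local; this copy exists only for demonstration):

#include <cassert>
#include <cstddef>

// Same arithmetic as the file-local helper: round offset up to a multiple of alignment.
static size_t align_offset(size_t offset, size_t alignment)
{
    const size_t remainder = (alignment != 0U) ? offset % alignment : 0U;
    return (remainder != 0U) ? offset + (alignment - remainder) : offset;
}

int main()
{
    assert(align_offset(100, 0) == 100);  // no alignment requested
    assert(align_offset(96, 32) == 96);   // already aligned
    assert(align_offset(100, 32) == 128); // rounded up to the next 32-byte boundary
    return 0;
}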
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index 36eaf0b..70cbe90 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,11 +34,11 @@
using namespace arm_compute;
-OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, size_t blob_size)
- : _allocator(allocator), _blob(), _blob_size(blob_size)
+OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info)
+ : _allocator(allocator), _blob(), _blob_info(blob_info)
{
ARM_COMPUTE_ERROR_ON(!allocator);
- _blob = _allocator->make_region(blob_size, 0);
+ _blob = _allocator->make_region(blob_info.size, blob_info.alignment);
}
void OffsetMemoryPool::acquire(MemoryMappings &handles)
@@ -49,7 +49,7 @@
for(auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
- handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_size - handle.second));
+ handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second));
}
}
@@ -70,5 +70,5 @@
std::unique_ptr<IMemoryPool> OffsetMemoryPool::duplicate()
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_size);
+ return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_info);
}
\ No newline at end of file
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 5fa51d7..38edb8b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -138,7 +138,7 @@
}
else
{
- _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
+ _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment());
}
info().set_is_resizable(false);
}