arm_compute v17.12
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 57a1738..cdf1b54 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -34,3 +34,8 @@
k->configure(input, output, activation_info);
_kernel = std::move(k);
}
+
+Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+ return NEActivationLayerKernel::validate(input, output, act_info);
+}
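The new static validate() entry points mirror the kernel-level checks and operate on ITensorInfo only, so a caller can reject an invalid configuration before allocating any backing memory. A minimal usage sketch (the helper name is illustrative, and it assumes Status exposes error_code() and ErrorCode::OK as declared in arm_compute/core/Error.h):

#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"

using namespace arm_compute;

// Returns true when the given tensor metadata is accepted by the kernel.
bool can_run_activation(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act)
{
    // Only the metadata is inspected; nothing is allocated or configured yet.
    const Status s = NEActivationLayer::validate(src, dst, act);
    return s.error_code() == ErrorCode::OK;
}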
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 11f5aa7..b5dd4d0 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -36,3 +36,7 @@
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
+Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return NEArithmeticAdditionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 37586af..5c0491e 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -36,3 +36,7 @@
k->configure(input1, input2, output, policy);
_kernel = std::move(k);
}
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index ef79b02..f6be001 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -43,6 +43,12 @@
_norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
}
+Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
+ float epsilon)
+{
+ return NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon);
+}
+
void NEBatchNormalizationLayer::run()
{
NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NECol2Im.cpp
similarity index 68%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/NEON/functions/NECol2Im.cpp
index 37857b6..78c6bc0 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NECol2Im.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,21 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NECol2Im.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
#include "support/ToolchainSupport.h"
-#include <utility>
-
using namespace arm_compute;
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NECol2Im::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
{
- auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
- k->configure(input, output, policy, shift);
+ auto k = arm_compute::support::cpp14::make_unique<NECol2ImKernel>();
+ k->configure(input, output, convolved_dims);
_kernel = std::move(k);
}
+
+Status NECol2Im::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
+{
+ return NECol2ImKernel::validate(input, output, convolved_dims);
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index f34f497..25c639f 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -136,10 +136,7 @@
// Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
- unsigned int pad_x = 0;
- unsigned int pad_y = 0;
std::tie(stride_x, stride_y) = conv_info.stride();
- std::tie(pad_x, pad_y) = conv_info.pad();
// Get convolved dimensions
unsigned int conv_w = 0;
@@ -190,9 +187,17 @@
{
if(_are_weights_reshaped)
{
- const unsigned int transpose_width = 16 / input->info()->element_size();
- mat_weights_cols = weights_info.num_kernels();
- mat_weights_rows = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
+ if(_is_fully_connected_convolution)
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(1);
+ }
+ else
+ {
+ const unsigned int transpose_width = 16 / input->info()->element_size();
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
+ }
}
else
{
@@ -270,7 +275,7 @@
// Configure matrix multiplication kernel
if(_is_fully_connected_convolution)
{
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f);
}
else
{
@@ -295,7 +300,7 @@
}
_input_im2col_reshaped.allocator()->allocate();
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
_gemm_output.allocator()->allocate();
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
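The reshaped-weights fix above matters because the fully-connected path reshapes weights with a plain transpose, while the GEMM path packs them 1xW-transposed. A worked sketch of the two row computations, assuming F32 data (element_size() == 4, so transpose_width == 16 / 4 == 4) and hypothetical weight dimensions:

// GEMM-transposed path: weights->info()->dimension(0) == 148, _has_bias == true
//   mat_weights_rows = 148 / 4 + 1 = 38
// Fully-connected path: no transpose_width term applies; the row count is
//   simply weights->info()->dimension(1), which the old code mis-computed.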
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
new file mode 100644
index 0000000..7b4e77b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _scaled_output()
+{
+}
+
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
+ unsigned int ax, unsigned int ay, float upscalex, float upscaley)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) < 1);
+
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ info.pad().first, info.pad().second, ax, ay, upscalex, upscaley, info.round());
+
+ const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias);
+
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+
+ _memory_group.manage(&_scaled_output);
+
+ // Configure the scale function
+ // Init and allocate an intermediate tensor for the output, same size as the input but with the first two axes matching the output tensor
+ TensorShape scale_out_shape(input->info()->tensor_shape());
+ scale_out_shape.set(0, output->info()->dimension(0));
+ scale_out_shape.set(1, output->info()->dimension(1));
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _scaled_output.allocator()->init(scale_out_info);
+ const unsigned int kernel_size = weights->info()->dimension(0);
+ // Padding for the upsampled image is calculated with the equation: p' = k - p - 1, where k is the kernel size and p is the input padding
+ ARM_COMPUTE_ERROR_ON(info.pad().first > (kernel_size - 1));
+ const unsigned int tr_px = kernel_size - info.pad().first - 1;
+ const unsigned int tr_py = kernel_size - info.pad().second - 1;
+ const unsigned int tr_stride = 1;
+ const PadStrideInfo transposed_info(tr_stride, tr_stride, tr_px, tr_py);
+ _scale_f.configure(input, &_scaled_output, std::make_pair(ax, ay), std::make_pair(info.stride().first - 1u, info.stride().second - 1u), transposed_info);
+ // Set up the function to convolve the upscaled output
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
+ break;
+ }
+ case 3:
+ {
+ _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
+ break;
+ }
+ case 5:
+ {
+ _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 2, 2, DimensionRoundingType::CEIL));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ _scaled_output.allocator()->allocate();
+}
+
+void NEDeconvolutionLayer::run()
+{
+ _memory_group.acquire();
+ _scale_f.run();
+ _conv_f.run();
+ _memory_group.release();
+}
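Plugging the p' = k - p - 1 relation into the supported cases shows why the fixed PadStrideInfo values in the switch line up: with kernel_size = 3 and info.pad() = (1, 1), the upsampled image gets tr_px = tr_py = 3 - 1 - 1 = 1 of border padding, and the follow-up convolution runs at stride 1 with the usual 'same' padding. A quick sketch of the arithmetic for the three supported kernel sizes:

// kernel_size = 1 -> conv pad (0, 0)
// kernel_size = 3 -> conv pad (1, 1)
// kernel_size = 5 -> conv pad (2, 2)
// i.e. conv pad = (kernel_size - 1) / 2, 'same' padding at stride 1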
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
new file mode 100644
index 0000000..79b9b2d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <cstddef>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void precompute_offsets(ITensor *offsets, float wr, size_t input_element_size, const std::pair<unsigned int, unsigned int> &a,
+ const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+ Window win;
+ const int padx = info.pad().first;
+ const int pady = info.pad().second;
+ const int ax = a.first;
+ const int ay = a.second;
+ const int offset_width = offsets->info()->dimension(0);
+ const int offset_height = offsets->info()->dimension(1);
+ // The values of ax and ay denote the number of ZEROS to be added on the top and right inner border of the image.
+ // The step value along the X and Y axes depends on the number of zeros to be inserted between samples (number of zeros + 1).
+ // Pre-compute the X offsets; Y's stride is unknown at this point, so we can't pre-compute the Y offsets
+ for(int yi = ay; yi < (offset_height - pady); yi += (1 + iz.second))
+ {
+ for(int xi = padx; xi < (offset_width - ax); xi += (1 + iz.first))
+ {
+ int *ptr = reinterpret_cast<int *>(offsets->ptr_to_element(Coordinates(xi, yi)));
+ const size_t in_xi = (xi + 0.5f) * wr;
+ *reinterpret_cast<int32_t *>(ptr) = in_xi * input_element_size;
+ }
+ }
+}
+} // namespace
+
+NEDeconvolutionLayerUpsample::NEDeconvolutionLayerUpsample(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _offsets(),
+ _border_handler(),
+ _upsample()
+{
+}
+
+void NEDeconvolutionLayerUpsample::configure(ITensor *input, ITensor *output, const std::pair<unsigned int, unsigned int> &a,
+ const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ // Get the tensor shape
+ const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+
+ // Compute the ratio between source width/height and destination width/height
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+ ARM_COMPUTE_UNUSED(hr);
+ // Get the element size of the input image
+ const size_t input_element_size = input->info()->element_size();
+
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ _offsets.allocator()->init(tensor_info_offsets);
+
+ _upsample.configure(input, &_offsets, output);
+
+ // Allocate once the configure methods have been called
+ _offsets.allocator()->allocate();
+ // Pre-compute offsets for nearest interpolation
+ std::fill_n(reinterpret_cast<int32_t *>(_offsets.buffer()), _offsets.info()->total_size() / sizeof(int32_t), -1 * input_element_size);
+ precompute_offsets(&_offsets, wr, input_element_size, a, iz, info);
+
+ _border_handler.configure(input, _upsample.border_size(), BorderMode::CONSTANT, PixelValue(0.f));
+}
+
+void NEDeconvolutionLayerUpsample::run()
+{
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+ _memory_group.acquire();
+ NEScheduler::get().schedule(&_upsample, Window::DimY);
+ _memory_group.release();
+}
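Since the upsample kernel does nearest-neighbour lookups through the _offsets map, the value precomputed for each surviving sample is the byte offset of its source pixel within the row. A worked sketch of the formula in precompute_offsets(), with hypothetical sizes:

// input width 4, output width 8 -> wr = 4.0f / 8.0f = 0.5f
// F32 input -> input_element_size = 4
// xi = 3: in_xi = size_t((3 + 0.5f) * 0.5f) = size_t(1.75f) = 1
//         stored offset = 1 * 4 = 4 bytes into the source row
// Entries the loops never visit keep the fill value -1 * input_element_size,
// which the kernel can treat as 'no source sample' (an inserted zero).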
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
similarity index 89%
rename from src/runtime/NEON/functions/NEDepthConcatenate.cpp
rename to src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index f8ad2ab..437c941 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -33,7 +33,7 @@
using namespace arm_compute;
-NEDepthConcatenate::NEDepthConcatenate() // NOLINT
+NEDepthConcatenateLayer::NEDepthConcatenateLayer() // NOLINT
: _inputs_vector(),
_concat_kernels_vector(),
_border_handlers_vector(),
@@ -41,12 +41,12 @@
{
}
-void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
+void NEDepthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
{
ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
_num_inputs = inputs_vector.size();
- _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+ _concat_kernels_vector = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
_border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
@@ -64,7 +64,7 @@
}
}
-void NEDepthConcatenate::run()
+void NEDepthConcatenateLayer::run()
{
for(unsigned i = 0; i < _num_inputs; ++i)
{
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
similarity index 83%
rename from src/runtime/NEON/functions/NEDepthConvert.cpp
rename to src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 37857b6..9a75404 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -21,18 +21,18 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
#include "support/ToolchainSupport.h"
#include <utility>
using namespace arm_compute;
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
+ auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
new file mode 100644
index 0000000..b890c6f
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
+ : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false)
+{
+}
+
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ // Call convolution kernel
+ _kernel.configure(input, weights, output, conv_info);
+ _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ if(biases != nullptr)
+ {
+ _bias_kernel.configure(output, biases);
+ _has_bias = true;
+ }
+}
+
+void NEDepthwiseConvolutionLayer3x3::run()
+{
+ NEScheduler::get().schedule(&_border_handler, Window::DimX);
+ NEScheduler::get().schedule(&_kernel, Window::DimX);
+ if(_has_bias)
+ {
+ NEScheduler::get().schedule(&_bias_kernel, Window::DimX);
+ }
+}
+
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output()
+{
+}
+
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+
+ const size_t weights_w = weights->info()->dimension(0);
+ const size_t weights_h = weights->info()->dimension(1);
+ const size_t weights_z = weights->info()->dimension(2);
+
+ bool has_bias = (biases != nullptr);
+
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+
+ // Set up intermediate tensors
+ const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
+ const size_t conv_size = conv_w * conv_h;
+
+ // Im2Col configuration
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, patch_size);
+ shape_im2col.set(1, conv_size);
+ shape_im2col.set(2, weights_z);
+ const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _input_reshaped.allocator()->init(info_im2col);
+ _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
+
+ // Weights reshape configuration
+ const TensorShape shape_weights_reshape(patch_size, weights_z);
+ const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+ _weights_reshaped.allocator()->init(info_weights_reshape);
+ _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+
+ // GEMV configuration
+ TensorShape shape_v2mm_out = input->info()->tensor_shape();
+ shape_v2mm_out.set(0, conv_size * weights_z);
+ shape_v2mm_out.set(1, 1);
+ shape_v2mm_out.set(2, 1);
+ const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _v2mm_output.allocator()->init(info_v2mm_out);
+ _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
+ _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+
+ // Allocate intermediate tensors
+ _input_reshaped.allocator()->allocate();
+ _weights_reshaped.allocator()->allocate();
+ _v2mm_output.allocator()->allocate();
+}
+
+void NEDepthwiseConvolutionLayer::run()
+{
+ NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
+}
\ No newline at end of file
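The three intermediate tensors follow mechanically from the patch and convolved sizes computed above. A worked sketch with hypothetical dimensions: 3x3 weights, bias present, stride 1 and pad 1 on a 16x16xC input:

// patch_size = 3 * 3 + 1 = 10
// scaled_dimensions(16, 16, 3, 3, conv_info) -> conv_w = conv_h = 16, conv_size = 256
// _input_reshaped   : [10, 256, C]  (one column per output position, per channel)
// _weights_reshaped : [10, C]       (one row of weights + bias per channel)
// _v2mm_output      : [256 * C, 1, 1], mapped back to [16, 16, C] by
//                     _vector_to_tensor_kernel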
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
new file mode 100644
index 0000000..d70a668
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEDepthwiseSeparableConvolutionLayer::NEDepthwiseSeparableConvolutionLayer()
+ : _depthwise_conv(), _pointwise_conv()
+{
+}
+
+void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITensor *depthwise_weights, const ITensor *depthwise_biases, ITensor *depthwise_out,
+ const ITensor *pointwise_weights, const ITensor *pointwise_biases, ITensor *output,
+ const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
+{
+ _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
+ _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
+}
+
+void NEDepthwiseSeparableConvolutionLayer::run()
+{
+ _depthwise_conv.run();
+ _pointwise_conv.run();
+}
\ No newline at end of file
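A minimal configuration sketch for the composed function (tensor shapes and names are illustrative; the caller owns the depthwise_out intermediate, which must match the depthwise stage's output shape, and all tensors must be initialised and allocated as usual):

// MobileNet-style block: 3x3 depthwise followed by 1x1 pointwise
Tensor in, dw_w, dw_b, dw_out, pw_w, pw_b, out;
NEDepthwiseSeparableConvolutionLayer dws_conv;
dws_conv.configure(&in, &dw_w, &dw_b, &dw_out, &pw_w, &pw_b, &out,
                   PadStrideInfo(1, 1, 1, 1) /* depthwise */,
                   PadStrideInfo(1, 1, 0, 0) /* pointwise */);
// ... allocate all tensors, then:
dws_conv.run();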
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 52a4cc1..afa5d97 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
using namespace arm_compute;
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+ : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false)
{
}
@@ -46,38 +46,28 @@
_accumulator.allocator()->free();
}
+ // Check if bias should be added in the convolution result
+ _has_bias = (bias != nullptr);
+
// Allocate the intermediate accumulator tensor in case of fixed point input
- switch(output->info()->data_type())
+ if(is_data_type_fixed_point(input->info()->data_type()))
{
- case DataType::QS8:
+ const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
+ _memory_group.manage(&_accumulator);
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ if(_has_bias)
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
- _memory_group.manage(&_accumulator);
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
_accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- break;
}
- case DataType::QS16:
+ _accumulator.allocator()->allocate();
+ }
+ else
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ if(_has_bias)
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
- _memory_group.manage(&_accumulator);
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- break;
- }
- case DataType::F16:
- case DataType::F32:
- {
- _conv_kernel.configure(input, weights, output, conv_info);
_accumulate_bias_kernel.configure(output, bias);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
}
}
@@ -85,6 +75,38 @@
_input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
}
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+
+ DataType data_type = output->data_type();
+ if(is_data_type_fixed_point(data_type))
+ {
+ // Promote data type in case of fixed point
+ data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
+ }
+ TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+ // Validate Convolution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
+
+ // Validate bias
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias == nullptr) && is_data_type_fixed_point(data_type),
+ "Biases should be provided for fixed point inputs");
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+ "Biases size and number of input feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+
+ // Validate bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output));
+ }
+
+ return Status{};
+}
+
void NEDirectConvolutionLayer::run()
{
NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
@@ -92,7 +114,10 @@
_memory_group.acquire();
NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
- NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+ if(_has_bias)
+ {
+ NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+ }
_memory_group.release();
}
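The rewritten branch collapses the old per-type switch into a single fixed-point path with a promoted accumulator, and bias accumulation now runs only when a bias is actually provided. A short sketch of the mapping the new code implements:

// is_data_type_fixed_point(input type):
//   QS8  -> accumulate in QS16; NEDirectConvolutionLayerBiasAccumulateKernel
//           then adds the (optional) bias while writing the final output type
//   QS16 -> accumulate in QS32, same bias/convert step
// otherwise (F16/F32): convolve straight into the output; the bias, if any,
//   is accumulated in place on the output tensor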
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 2e8d105..fc04e28 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -133,7 +133,7 @@
const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
- _linearize_input = input->info()->tensor_shape().x() != linear_input_size;
+ _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
_are_weights_reshaped = are_weights_reshaped;
_accumulate_biases = biases != nullptr;
_is_batched_fc_layer = num_batch_dimensions > 0;
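The extra clause in _linearize_input catches degenerate multi-dimensional inputs whose flattened size happens to equal their X extent. A sketch of a case the old test missed (hypothetical shape):

// input tensor_shape = [1, 1, 1, 8] with one batch dimension of size 8
// num_input_dimensions = 3, linear_input_size = 1 * 1 * 1 = 1, x() = 1
// old test:  x() != linear_input_size -> false, input never linearized
// new test:  also fires when a >1-dimensional input flattens to size 1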
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index ff92ef8..950f4c9 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -39,6 +40,7 @@
{
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
} // namespace arm_compute
@@ -96,6 +98,14 @@
{
_mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
}
+ else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ }
#endif /* defined(__arm__) || defined(__aarch64__) */
#if defined(__arm__) || defined(__aarch64__)
@@ -107,19 +117,32 @@
const int N = d->info()->tensor_shape().x();
const int K = a->info()->tensor_shape().x();
+ size_t workbench_size = 0;
+
#if defined(__arm__)
- GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+ workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
#elif defined(__aarch64__)
- GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+ if(a->info()->data_type() == DataType::F32)
+ {
+ workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+ }
+ else if(a->info()->data_type() == DataType::F16)
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ }
#endif /* defined(__arm__) || defined(__aarch64__) */
constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
_memory_group.manage(&_workspace);
// Configure matrix multiplication kernel
_mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
-
_workspace.allocator()->allocate();
}
else
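Both the FP32 and new FP16 assembly paths share the workspace sizing rule above: the kernel's own get_working_size() report, padded up to the 4096-byte alignment, times the thread count. A worked sketch with hypothetical numbers:

// workbench_size = 18432 bytes reported by GemmInterleaved::get_working_size()
// alignment = 4096, NEScheduler::get().num_threads() = 4
// allocated bytes = (18432 + 4096 - 1) * 4 = 90108
// i.e. one aligned scratch region per scheduler thread, plus alignment slack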
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
deleted file mode 100644
index 7413b28..0000000
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-
-using namespace arm_compute;
-
-NEGEMMLowp::NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
-{
-}
-
-void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
-
- /* The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] */
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
- TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
-
- _interleave_kernel.configure(a, &_tmp_a);
- _transpose_kernel.configure(b, &_tmp_b);
- _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
-
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
-}
-
-void NEGEMMLowp::run()
-{
- _memory_group.acquire();
-
- /* Run interleave kernel */
- NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
-
- /* Run transpose kernel */
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-
- /* Run matrix multiply kernel */
- NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
-
- _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
new file mode 100644
index 0000000..6e03ffa
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -0,0 +1,216 @@
+/* Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
+} // namespace arm_compute
+
+using namespace arm_compute;
+
+NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+{
+}
+
+void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
+#ifdef __aarch64__
+ const int M = output->info()->tensor_shape().y();
+ const int N = output->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+ constexpr size_t workspace_alignment = 4096;
+ const struct CPUInfo ci = NEScheduler::get().cpu_info();
+#endif /* __aarch64__ */
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+ if(ci.CPU == CPUTarget::A75_DOT)
+ {
+ // Configure matrix multiply kernel
+ GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
+ k->configure(a, b, output, &_workspace, 1.f, 1.f);
+ _mm_kernel = std::move(k);
+ _workspace.allocator()->allocate();
+ }
+ else if(ci.CPU == CPUTarget::A55_DOT)
+ {
+ ARM_COMPUTE_ERROR_ON("WIP");
+ }
+ else
+#elif defined(ARM_COMPUTE_AARCH64_V8A)
+ if(ci.CPU == CPUTarget::A53)
+ {
+ switch(a->info()->data_type())
+ {
+ case DataType::S8:
+ {
+ // Configure matrix multiply kernel
+ GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ }
+ break;
+ case DataType::U8:
+ {
+ // Configure matrix multiply kernel
+ GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ }
+
+ _memory_group.manage(&_workspace);
+ // Configure matrix multiplication kernel
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
+ k->configure(a, b, output, &_workspace, 1.f, 1.f);
+ _mm_kernel = std::move(k);
+ _workspace.allocator()->allocate();
+ }
+ else if(1) // Generic v8a kernel
+ {
+ switch(a->info()->data_type())
+ {
+ case DataType::S8:
+ {
+ // Configure matrix multiply kernel
+ GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ }
+ break;
+ case DataType::U8:
+ {
+ // Configure matrix multiply kernel
+ GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ }
+ _memory_group.manage(&_workspace);
+ // Configure matrix multiplication kernel
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
+ k->configure(a, b, output, &_workspace, 1.f, 1.f);
+ _mm_kernel = std::move(k);
+ _workspace.allocator()->allocate();
+ }
+ else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+ {
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
+ // Configure interleave kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(a, &_tmp_a);
+ _mtx_a_reshape_kernel = std::move(k);
+ }
+
+ // Configure transpose kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = std::move(k);
+ }
+
+ // Configure matrix multiply kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(&_tmp_a, &_tmp_b, output);
+ _mm_kernel = std::move(k);
+ }
+
+ // Allocate tensors
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+}
+
+void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
+{
+ _memory_group.acquire();
+ if(_mtx_a_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+ }
+
+ if(_mtx_b_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ }
+
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+
+ _memory_group.release();
+}
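In the fallback path both operands are reshaped before the multiply, and the shapes follow mechanically from the 4x4 interleave and 1x16 transpose. A worked sketch for hypothetical operand sizes M = 64, N = 32, K = 48 (so a is [48, 64] and b is [32, 48] in x/y order):

// _tmp_a: x = 48 * 4  = 192, y = ceil(64 / 4)  = 16  (4 rows packed per row)
// _tmp_b: x = 48 * 16 = 768, y = ceil(32 / 16) = 2   (16 columns packed per row)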
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000..50aa5b6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
+} // namespace arm_compute
+
+using namespace arm_compute;
+
+NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
+ _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_UNUSED(gemm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+ // Check for DOT product instruction
+ const struct CPUInfo ci = NEScheduler::get().cpu_info();
+ const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+ if(cpu_has_dotprod != 0)
+ {
+ _dot_product_path = true;
+
+ // Configure matrix multiply kernel
+ struct CPUInfo ci = NEScheduler::get().cpu_info();
+ const int M = output->info()->tensor_shape().y();
+ const int N = output->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+
+ const size_t workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+ constexpr size_t alignment = 4096;
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Configure matrix multiplication kernel
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
+ k->configure(a, b, output, &_workspace, 1.f, 1.f);
+ _mm_kernel = std::move(k);
+ }
+ else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+ {
+ if(_run_vector_matrix_multiplication)
+ {
+ // Configure matrix multiply kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(a, b, output);
+ _mm_kernel = std::move(k);
+ }
+ }
+ else
+ {
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
+ // Configure interleave kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(a, &_tmp_a);
+ _mtx_a_reshape_kernel = std::move(k);
+ }
+
+ // Configure transpose kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = std::move(k);
+ }
+
+ // Configure matrix multiply kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(&_tmp_a, &_tmp_b, output);
+ _mm_kernel = std::move(k);
+ }
+ }
+ }
+
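+ // The quantized product computes (A - a_offset) * (B - b_offset), which expands to
+ // A*B - b_offset * rowsum(A) - a_offset * colsum(B) + K * a_offset * b_offset, with K the
+ // number of columns of A. The reduction kernels below compute the row and column sums, and
+ // the offset contribution kernel folds the correction terms into the S32 result
+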
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ TensorShape shape_vector_sum_col = b->info()->tensor_shape();
+ if(b->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_col.remove_dimension(1);
+ }
+ TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+ _vector_sum_col.allocator()->init(info_vector_sum_col);
+ _memory_group.manage(&_vector_sum_col);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ TensorShape shape_vector_sum_row = a->info()->tensor_shape();
+ shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
+ if(a->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_row.remove_dimension(1);
+ }
+ TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+ _vector_sum_row.allocator()->init(info_vector_sum_row);
+ _memory_group.manage(&_vector_sum_row);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+ }
+
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+
+ // Allocate tensors
+ if(_dot_product_path)
+ {
+ _workspace.allocator()->allocate();
+ }
+ else if(!_run_vector_matrix_multiplication)
+ {
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+
+ if(_a_offset != 0)
+ {
+ _vector_sum_col.allocator()->allocate();
+ }
+
+ if(_b_offset != 0)
+ {
+ _vector_sum_row.allocator()->allocate();
+ }
+}
+
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+ "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+ "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ int32_t a_offset = a->quantization_info().offset;
+ int32_t b_offset = b->quantization_info().offset;
+ bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+ // Check for DOT product instruction
+ const struct CPUInfo ci = NEScheduler::get().cpu_info();
+ const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+ if(cpu_has_dotprod != 0)
+ {
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+ }
+ else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+ {
+ if(!run_vector_matrix_multiplication)
+ {
+ // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ }
+ }
+
+ TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+ // Validate matrix B reduction kernel only if a_offset is not equal to 0
+ if(a_offset != 0)
+ {
+ TensorShape shape_vector_sum_col = b->tensor_shape();
+ if(b->num_dimensions() > 1)
+ {
+ shape_vector_sum_col.remove_dimension(1);
+ }
+ info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);
+
+ // Validate Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
+ }
+
+ // Validate Matrix A reduction kernel only if b_offset is not equal to 0
+ if(b_offset != 0)
+ {
+ TensorShape shape_vector_sum_row = a->tensor_shape();
+ shape_vector_sum_row.set(Window::DimX, a->dimension(1));
+ if(a->num_dimensions() > 1)
+ {
+ shape_vector_sum_row.remove_dimension(1);
+ }
+ info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);
+
+ // Validate matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
+
+ return Status{};
+}
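+
+// Illustrative call sequence (tensor setup omitted, names hypothetical): validate the
+// tensor metadata first, then configure and run:
+//
+// NEGEMMLowpMatrixMultiplyCore gemmlowp;
+// ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a.info(), b.info(), dst.info(), GEMMInfo()));
+// gemmlowp.configure(&a, &b, &dst, GEMMInfo());
+// gemmlowp.run();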
+
+void NEGEMMLowpMatrixMultiplyCore::run()
+{
+ _memory_group.acquire();
+
+ // Reshape the inputs only if we run neither the vector-by-matrix case nor the optimized GEMM with the dot product instruction
+ if(!_run_vector_matrix_multiplication && !_dot_product_path)
+ {
+ if(_mtx_a_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+ }
+
+ if(_mtx_b_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ }
+ }
+
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ }
+
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+
+ _memory_group.release();
+}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
new file mode 100644
index 0000000..8c02436
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
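+// In outline, and assuming the kernel's documented behaviour, this stage computes per element:
+// ((input + bias + result_offset) * result_mult_int) >> result_shift
+// then clamps to [min, max] when a range is given and saturating-casts to uint8; the bias
+// term is only added when a bias tensor is provided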
+void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
+ k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
+ _kernel = std::move(k);
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+ return NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
+}
+
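+// In outline, the fixed-point variant multiplies each (bias-corrected) entry by
+// result_fixedpoint_multiplier using a saturating rounding doubling high multiply, applies a
+// rounding right shift by result_shift, adds result_offset_after_shift, then clamps to
+// [min, max] when a range is given and saturating-casts to uint8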
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
+ int result_offset_after_shift, int min, int max)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
+ k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+ _kernel = std::move(k);
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+ return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 84ea0ca..8a85bba 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -47,7 +47,8 @@
}
NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
- : _border_handler(),
+ : _horizontal_border_handler(),
+ _vertical_border_handler(),
_horizontal_reduction(),
_vertical_reduction()
{
@@ -62,6 +63,9 @@
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
+ // Constant value to use for vertical fill border when the border mode is CONSTANT
+ const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
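+ // The intermediate image carries the gain of the 5-tap horizontal Gaussian kernel
+ // [ 1 4 6 4 1 ] (sum 16), so the vertical border value is the constant scaled by 16,
+ // written above as (2 + 8 + 6) * constant_border_value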
+
/* Get number of pyramid levels */
const size_t num_levels = pyramid->info()->num_levels();
@@ -70,9 +74,10 @@
if(num_levels > 1)
{
- _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
- _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
- _vertical_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+ _horizontal_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+ _vertical_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
// Apply half scale to the X dimension of the tensor shape
TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -84,13 +89,16 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
/* Configure horizontal kernel */
- _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+ _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
/* Configure vertical kernel */
- _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+ _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
/* Configure border */
- _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+
+ /* Configure vertical border */
+ _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
}
_tmp.allocate();
@@ -109,8 +117,9 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- NEScheduler::get().schedule(_border_handler.get() + i, Window::DimZ);
+ NEScheduler::get().schedule(_horizontal_border_handler.get() + i, Window::DimZ);
NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+ NEScheduler::get().schedule(_vertical_border_handler.get() + i, Window::DimZ);
NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
similarity index 64%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/NEON/functions/NEIm2Col.cpp
index 37857b6..8e90e66 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,21 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
#include "support/ToolchainSupport.h"
-#include <utility>
-
using namespace arm_compute;
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
- k->configure(input, output, policy, shift);
+ auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+ k->configure(input, output, kernel_dims, conv_info, has_bias);
_kernel = std::move(k);
}
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias);
+}
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
similarity index 86%
rename from src/runtime/NEON/functions/NEL2Normalize.cpp
rename to src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 349a781..fa62483 100644
--- a/src/runtime/NEON/functions/NEL2Normalize.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -21,19 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
using namespace arm_compute;
-NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
{
}
-void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
{
// Manage intermediate buffers
_memory_group.manage(&_sumsq);
@@ -46,7 +46,7 @@
_sumsq.allocator()->allocate();
}
-void NEL2Normalize::run()
+void NEL2NormalizeLayer::run()
{
_memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index a680f1f..0e149d4 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -28,7 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/runtime/Tensor.h"
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index cb48598..b29b796 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -111,7 +111,7 @@
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
// Allocate intermediate tensors
_weights_reshaped.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index 7877995..f865054 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -31,18 +31,36 @@
using namespace arm_compute;
-void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16)
+void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type, bool use_fp16)
{
if(use_fp16)
{
- auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
- k->configure(input1, input2, output, nullptr);
- _kernel = std::move(k);
+ if(mag_type == MagnitudeType::L1NORM)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
}
else
{
- auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
- k->configure(input1, input2, output, nullptr);
- _kernel = std::move(k);
+ if(mag_type == MagnitudeType::L1NORM)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
}
}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index e01ef66..af98ac1 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -37,9 +37,9 @@
{
}
-void NENormalizationLayer::configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
{
- ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
_input_squared.allocator()->init(tensor_info);
@@ -56,6 +56,17 @@
_input_squared.allocator()->allocate();
}
+Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+ // Perform validation step
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+ return Status{};
+}
+
void NENormalizationLayer::run()
{
_memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 436d22f..6392281 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -30,9 +30,18 @@
using namespace arm_compute;
-void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type)
{
- auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
- k->configure(input1, input2, nullptr, output);
- _kernel = std::move(k);
+ if(phase_type == PhaseType::UNSIGNED)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+ k->configure(input1, input2, nullptr, output);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, nullptr, output);
+ _kernel = std::move(k);
+ }
}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 2e2ea11..5a474e4 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -36,3 +36,7 @@
k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
_kernel = std::move(k);
}
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
+}
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index f8a85b9..530c7fc 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -48,6 +48,11 @@
_border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast<float>(0.f)));
}
+Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+ return NEPoolingLayerKernel::validate(input, output, pool_info);
+}
+
void NEPoolingLayer::run()
{
// Fill border
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bbd3fac..bd565c9 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -42,9 +42,11 @@
namespace
{
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
{
ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+ ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+ ARM_COMPUTE_UNUSED(sampling_policy);
Window win;
win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -95,7 +97,7 @@
{
}
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
{
ARM_COMPUTE_ERROR_ON(nullptr == input);
ARM_COMPUTE_ERROR_ON(nullptr == output);
@@ -131,13 +133,13 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
_offsets.allocator()->init(tensor_info_offsets);
- _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+ _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined, sampling_policy);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
// Pre-compute offsets for nearest interpolation
- precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size);
+ precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, sampling_policy);
break;
}
case InterpolationPolicy::BILINEAR:
@@ -149,7 +151,7 @@
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
- _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+ _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined, sampling_policy);
// Allocate once the configure methods have been called
_offsets.allocator()->allocate();
@@ -157,7 +159,7 @@
_dy.allocator()->allocate();
// Pre-compute dx, dy and offsets for bilinear interpolation
- precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size);
+ precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, sampling_policy);
break;
}
case InterpolationPolicy::AREA:
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index cc5d4e9..8e6773c 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -36,9 +36,9 @@
{
}
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Create intermediate tensors shapes
TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
@@ -57,7 +57,7 @@
// Configure Kernels
_max_kernel.configure(input, &_max);
- _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+ _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
_norm_kernel.configure(&_tmp, &_sum, output);
_fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
@@ -67,6 +67,23 @@
_sum.allocator()->allocate();
}
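+
+// The layer computes y_i = exp(beta * (x_i - max_j x_j)) / sum_k exp(beta * (x_k - max_j x_j));
+// subtracting the row-wise maximum keeps the exponentials numerically stable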
+Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+{
+ // Perform validation step
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ TensorShape max_sum_shape = input->tensor_shape();
+ max_sum_shape.set(0, 1);
+
+ TensorInfo tensor_info_max_sum(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(max_sum_shape));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DShiftExpSumKernel::validate(input, &tensor_info_max_sum, input, &tensor_info_max_sum, beta));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DNormKernel::validate(input, &tensor_info_max_sum, output));
+
+ return Status{};
+}
+
void NESoftmaxLayer::run()
{
_memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index eb81e02..b5b28e8 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -36,3 +36,8 @@
k->configure(input, output);
_kernel = std::move(k);
}
+
+Status NETranspose::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NETransposeKernel::validate(input, output);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
new file mode 100644
index 0000000..3251de4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace
+{
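+// Gather the NCHW tensor dimensions into the NHWC-ordered Tensor4DShape
+// (batches, height, width, channels) expected by the Winograd code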
+inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
+{
+ const int in_width = input->info()->dimension(0);
+ const int in_height = input->info()->dimension(1);
+ const int in_batches = input->info()->dimension(3);
+ const int in_channels = input->info()->dimension(2);
+ return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+}
+} /* namespace */
+
+namespace arm_compute
+{
+NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _winograd_kernel(), _weights_workspace(), _workspace(), _kernel_storage(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+{
+}
+
+void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(1) != 3 || weights->info()->dimension(0) != 3, "Only 3x3 kernels are supported");
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ _weights = weights;
+ _input = input;
+ _output = output;
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
+
+ // Get convolved dimensions
+ auto padding = PADDING_VALID;
+ const int in_channels = input->info()->dimension(2);
+
+ const int out_channels = output->info()->dimension(2);
+ const int weights_width = weights->info()->dimension(0);
+ const int weights_height = weights->info()->dimension(1);
+
+ const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
+ const Tensor4DShape in_shape(internal_get_input_shape(input));
+
+ // Get the memory required to instantiate a new Winograd operator.
+ constexpr size_t kstore_alignment = 64;
+ const size_t kernel_storage_size = NEWinogradLayerKernel::get_kernel_storage_size(kernel_shape);
+ _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + kstore_alignment - 1) }, 1, DataType::U8));
+ _memory_group.manage(&_kernel_storage);
+
+ // Get workbench size and allocate memory
+ constexpr size_t wspace_alignment = 64;
+ const size_t ws_size = NEWinogradLayerKernel::get_working_space_size(in_shape, kernel_shape, padding);
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (ws_size + wspace_alignment - 1) }, 1, DataType::U8));
+ _memory_group.manage(&_workspace);
+
+ // Workspace for weights transform
+ const size_t weights_transform_size = NEWinogradLayerKernel::get_kernel_transform_working_size(kernel_shape);
+ _weights_workspace.allocator()->init(TensorInfo(TensorShape{ (weights_transform_size + wspace_alignment - 1) }, 1, DataType::U8));
+ _memory_group.manage(&_weights_workspace);
+
+ _kernel_storage.allocator()->allocate();
+ _workspace.allocator()->allocate();
+ _weights_workspace.allocator()->allocate();
+
+ // Create Winograd operator object
+ _conv = support::cpp14::make_unique<Winograd3x3F32>(kernel_shape, in_shape, padding, _kernel_storage.buffer());
+
+ // Configure the kernel; no padding is needed, so it is safe to call configure after allocate
+ _winograd_kernel.configure(output, _conv.get());
+}
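+
+// Illustrative usage (names hypothetical); strides must be 1 and kernels 3x3:
+//
+// NEWinogradLayer wino;
+// wino.configure(&src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 0, 0));
+// wino.run(); // the weight transform runs lazily on the first call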
+
+void NEWinogradLayer::run()
+{
+#if defined(__aarch64__)
+ _memory_group.acquire();
+ if(!_reshaped_kernel)
+ {
+ _conv->transform_weights(reinterpret_cast<const float *>(_weights->buffer()), reinterpret_cast<float *>(_weights_workspace.buffer()));
+ _reshaped_kernel = true;
+ }
+ const Tensor4DShape in_shape(internal_get_input_shape(_input));
+ auto padding = PADDING_VALID;
+
+ // Bring channels to the front, as the Winograd code expects the tensor in NHWC format
+ _conv->nchw2nhwc(in_shape, padding, _workspace.buffer(), reinterpret_cast<const float *>(_input->buffer()));
+
+ // Get pointers into the workspace
+ std::pair<void *, void *> nhwc_ptrs = _conv->get_nhwc_ptrs(in_shape, padding, _workspace.buffer());
+
+ // Set up the matrix pointers and transform the input tensor to the appropriate form before running GEMM
+ _conv->reshape_input(in_shape, padding, nhwc_ptrs.second, _workspace.buffer());
+
+ // Run the 16 GEMMs in multiple threads; each kernel runs one or more GEMMs
+ NEScheduler::get().schedule(&_winograd_kernel, Window::DimY);
+
+ // Transform the output to the appropriate form
+ _conv->reshape_output(in_shape, padding, nhwc_ptrs.first);
+
+ // Transform back to NCHW
+ _conv->nhwc2nchw(in_shape, padding, _workspace.buffer(), reinterpret_cast<float *>(_output->buffer()));
+
+ _memory_group.release();
+#else /* __aarch64__ */
+ ARM_COMPUTE_UNUSED(_winograd_kernel);
+ ARM_COMPUTE_UNUSED(_workspace);
+ ARM_COMPUTE_UNUSED(_kernel_storage);
+ ARM_COMPUTE_UNUSED(_input);
+ ARM_COMPUTE_UNUSED(_weights);
+ ARM_COMPUTE_UNUSED(_output);
+ ARM_COMPUTE_UNUSED(_reshaped_kernel);
+ ARM_COMPUTE_UNUSED(_conv);
+ ARM_COMPUTE_ERROR("Winograd only supported for aarch64, recompile with arch=arm64-v8a.");
+#endif /* __aarch64__ */
+}
+} // namespace arm_compute