arm_compute v17.04
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 632e470..2d7ad86 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -62,7 +62,7 @@
TensorInfo gradient_info;
TensorInfo magnitude_info;
- /* Initialize images */
+ // Initialize images
if(gradient_size < 7)
{
gradient_info.init(shape, Format::S16);
@@ -82,7 +82,7 @@
_phase.allocator()->init(info);
_nonmax.allocator()->init(info);
- /* Configure/Init sobelNxN */
+ // Configure/Init sobelNxN
if(gradient_size == 3)
{
auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
@@ -106,7 +106,7 @@
ARM_COMPUTE_ERROR("Gradient size not supported\n");
}
- /* Configure gradient */
+ // Configure gradient
if(use_fp16)
{
auto k = arm_compute::cpp14::make_unique<NEGradientFP16Kernel>();
@@ -120,28 +120,24 @@
_gradient = std::move(k);
}
- _gx.allocator()->allocate();
- _gy.allocator()->allocate();
-
- /* Configure non-maxima suppression */
+ // Configure non-maxima suppression
_non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
- _phase.allocator()->allocate();
+ // Fill border around the magnitude image as non-maxima suppression will access
+ // it. If the border mode is UNDEFINED, filling the border is a no-op.
+ _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
- if(border_mode != BorderMode::UNDEFINED)
- {
- /* Configure border filling for magnitude image */
- _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), BorderMode::CONSTANT, 0);
- }
-
- _magnitude.allocator()->allocate();
-
- /* Configure edge tracing */
+ // Configure edge tracing
_edge_trace.configure(&_nonmax, output);
// Fill border with "No edge" to stop recursion in edge trace
_border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _magnitude.allocator()->allocate();
_nonmax.allocator()->allocate();
}
@@ -150,16 +146,16 @@
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
ARM_COMPUTE_ERROR_ON(_output == nullptr);
- /* Run sobelNxN */
+ // Run sobelNxN
_sobel->run();
- /* Run gradient */
- NEScheduler::get().multithread(_gradient.get());
-
- /* Fill border before non-maxima suppression */
+ // Fill border before non-maxima suppression. No-op if the border mode is UNDEFINED.
_border_mag_gradient.run(_border_mag_gradient.window());
- /* Run non-maxima suppression */
+ // Run gradient
+ NEScheduler::get().multithread(_gradient.get());
+
+ // Run non-maxima suppression
NEScheduler::get().multithread(&_non_max_suppr);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
@@ -168,6 +164,6 @@
// Fill border before edge trace
_border_edge_trace.run(_border_edge_trace.window());
- /* Run edge tracing */
+ // Run edge tracing
_edge_trace.run(_edge_trace.window());
}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 89ec900..27eb4bc 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -35,7 +35,7 @@
NEConvolutionLayer::NEConvolutionLayer()
: _input_im2col_kernel(), _input_interleave_kernel(), _weights_reshape_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
- _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false), _is_fc(false)
+ _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false)
{
}
@@ -64,22 +64,12 @@
std::tie(stride_x, stride_y) = conv_info.stride();
std::tie(pad_x, pad_y) = conv_info.pad();
- bool is_same_dimension = true;
- // Make sure the input and weights have same low three dimensions
- for(int i = 0; i < 3; i++)
- {
- is_same_dimension = (is_same_dimension) && (input->info()->dimension(i) == weights->info()->dimension(i));
- }
-
- // Run the fully connected path if is_same_dimension is true and conv_stride_x/conv_stride_y are 1, and conv_pad_x/conv_pad_y are 0 and skip col2im
- _is_fc = (is_same_dimension) && ((stride_x & stride_y) == 1) && ((pad_x | pad_y) == 0);
-
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
-
std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
// Create tensor to store the reshaped weights
const size_t mat_weights_cols = weights->info()->dimension(3);
@@ -95,15 +85,11 @@
// Create tensor to store im2col reshaped inputs
const size_t mat_input_cols = mat_weights_rows;
- const size_t mat_input_rows = _is_fc ? (input->info()->dimension(3)) : (conv_w * conv_h);
+ const size_t mat_input_rows = conv_w * conv_h;
TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- if(_is_fc)
- {
- shape_im2col.set(3, 1);
- }
TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type());
_input_im2col_reshaped.allocator()->init(info_im2col);
@@ -126,16 +112,8 @@
_input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
_weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
_weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
-
- if(_is_fc)
- {
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, output, 1.0f);
- }
- else
- {
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
- }
+ _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
// Allocate the tensors once the all configure methods have been called
_weights_reshaped.allocator()->allocate();
@@ -165,8 +143,5 @@
NEScheduler::get().multithread(&_mm_kernel);
// Reshape output matrix
- if(!_is_fc)
- {
- NEScheduler::get().multithread(&_output_col2im_kernel);
- }
+ NEScheduler::get().multithread(&_output_col2im_kernel);
}
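
Note: the assert added in this hunk requires the output tensor to match the convolved dimensions returned by scaled_dimensions(). Below is a minimal, self-contained sketch of that arithmetic, assuming the common FLOOR rounding convention; scaled_dimensions() in the library remains the authoritative implementation.

#include <cstdio>
#include <utility>

// Hypothetical helper: conventional convolution output-size arithmetic with
// FLOOR rounding. Only illustrates the check "output dims == conv_w x conv_h".
static std::pair<unsigned int, unsigned int> conv_output_dims(unsigned int width, unsigned int height,
                                                              unsigned int kernel_size,
                                                              unsigned int stride_x, unsigned int stride_y,
                                                              unsigned int pad_x, unsigned int pad_y)
{
    const unsigned int conv_w = (width + 2 * pad_x - kernel_size) / stride_x + 1;  // integer division == floor
    const unsigned int conv_h = (height + 2 * pad_y - kernel_size) / stride_y + 1;
    return { conv_w, conv_h };
}

int main()
{
    // Example: 227x227 input, 11x11 kernel, stride 4, no padding -> 55x55 output
    const auto dims = conv_output_dims(227, 227, 11, 4, 4, 0, 0);
    std::printf("conv_w = %u, conv_h = %u\n", dims.first, dims.second);
    return 0;
}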
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index e67e4d6..670b4d4 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -40,14 +40,13 @@
_border_handler(),
_nonmax_kernel(),
_fill_kernel(),
- _out_border_handler_kernel(),
_output(),
_suppressed(),
_non_max(false)
{
}
-void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *const corners,
+void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners,
BorderMode border_mode, uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -56,19 +55,18 @@
ARM_COMPUTE_ERROR_ON(nullptr == corners);
ARM_COMPUTE_ERROR_ON(threshold < 1 && threshold > 255);
+ _non_max = nonmax_suppression;
+
TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8);
_output.allocator()->init(tensor_info);
- _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value);
- /*
- If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3, width - 3) and ywindow (3, height -3) so
- the output image will leave the pixels on the borders unchanged. This can cause problems if Non Max Suppression is performed afterwards.
- If non max sup is true && border == UNDEFINED we must set the border texels to 0 before executing the non max sup kernel
- */
+ // If the border is UNDEFINED, _fast_corners_kernel will operate in xwindow (3,
+ // width - 3) and ywindow (3, height - 3), so the output image will leave the
+ // pixels on the borders unchanged. This is reflected in the valid region of
+ // the output. The non-maxima suppression is only run on the valid pixels.
_fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
-
- _output.allocator()->allocate();
- _non_max = nonmax_suppression;
+ _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value);
if(!_non_max)
{
@@ -76,26 +74,26 @@
}
else
{
- if(border_mode == BorderMode::UNDEFINED)
- {
- // We use this kernel to set the borders to 0 before performing non max sup
- _out_border_handler_kernel.configure(&_output, _fast_corners_kernel.border_size(), PixelValue(static_cast<uint8_t>(0)));
- }
-
_suppressed.allocator()->init(tensor_info);
- _suppressed.allocator()->allocate();
_nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
_fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
+
+ // Allocate intermediate tensors
+ _suppressed.allocator()->allocate();
}
+
+ // Allocate intermediate tensors
+ _output.allocator()->allocate();
}
void NEFastCorners::run()
{
+ _border_handler.run(_border_handler.window());
+
NEScheduler::get().multithread(&_fast_corners_kernel);
if(_non_max)
{
- NEScheduler::get().multithread(&_out_border_handler_kernel); // make sure inner borders are set to 0 before running non max sup kernel
NEScheduler::get().multithread(&_nonmax_kernel);
}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index c6ef6c6..e6785b3 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -26,69 +26,234 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include <algorithm>
+#include <cmath>
+
using namespace arm_compute;
NEFullyConnectedLayer::NEFullyConnectedLayer()
- : _conv_function(), _gemm_function(), _transpose_kernel(), _acc_biases_kernel(), _run_func(), _weights_transposed(), _is_first_run(true), _run_acc_biases(false)
+ : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
+ _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(false), _batched_fc_layer(false), _accumulate_biases(false)
{
}
-void NEFullyConnectedLayer::configure(ITensor *input, ITensor *weights, const ITensor *biases, ITensor *output)
+void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON((weights->info()->num_dimensions() != 2) && (weights->info()->num_dimensions() != 4));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
- // Make sure that in the fully connected layer connected to fully connected layer case, the first dimension of the weights and input are same.
- ARM_COMPUTE_ERROR_ON((weights->info()->num_dimensions() == 2) && (input->info()->dimension(0) != weights->info()->dimension(0)));
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
- if(weights->info()->num_dimensions() != 2)
- {
- _conv_function.configure(input, weights, biases, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::FLOOR));
- _run_func = &NEFullyConnectedLayer::run_conv;
- return;
- }
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(1, input->info()->dimension(3));
+ shape_im2col.set(2, input->info()->dimension(4));
+ shape_im2col.set(3, input->info()->dimension(5));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
- TensorShape shape_trans(weights->info()->dimension(1), weights->info()->dimension(0));
- TensorInfo tensor_info(shape_trans, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(tensor_info);
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
- _transpose_kernel.configure(weights, &_weights_transposed);
- _gemm_function.configure(input, &_weights_transposed, nullptr, output, 1.0f, 0.0f);
+ // Initialize output tensor for transpose 1xW
+ TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
+ _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+ _interleave4x4_output.allocator()->allocate();
+ _transpose1xW_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = input->info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+
+ // Initialize output tensor for transpose 1xW
+ TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
+ _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(input, &_interleave4x4_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _interleave4x4_output.allocator()->allocate();
+ _transpose1xW_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(1, 1);
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+
+ // Allocate the output tensor for im2col once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(input, weights, output, 1.0f);
+}
+
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
+
+ _is_first_run = true;
+ _transpose_weights = transpose_weights;
+ _fc_after_conv = true;
+ _batched_fc_layer = false;
+ _accumulate_biases = false;
+
+ const ITensor *weights_to_use = weights;
if(biases != nullptr)
{
- _acc_biases_kernel.configure(output, biases);
- _run_acc_biases = true;
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
}
- _run_func = &NEFullyConnectedLayer::run_fc;
-
- // Allocate once all the configure methods have been called
- _weights_transposed.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::run_conv()
-{
- _conv_function.run();
-}
-
-void NEFullyConnectedLayer::run_fc()
-{
- if(_is_first_run)
+ // Check if we need to transpose the weights
+ if(_transpose_weights)
{
- _is_first_run = false;
- NEScheduler::get().multithread(&_transpose_kernel);
+ // Initialize the output tensor for transpose
+ TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
+ _transpose_kernel.configure(weights, &_transpose_output);
+
+ weights_to_use = &_transpose_output;
}
- _gemm_function.run();
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
- if(_run_acc_biases)
+ // Check if we have a fully connected layer with batches
+ _batched_fc_layer = (output->info()->dimension(1) > 1);
+
+ if(_batched_fc_layer)
{
- NEScheduler::get().multithread(&_acc_biases_kernel);
+ _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer with batches
+ configure_conv_fc_wb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer with batches
+ configure_fc_fc_wb(input, weights_to_use, output);
+ }
+ }
+ else
+ {
+ _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+
+ if(_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc_nb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc_nb(input, weights_to_use, output);
+ }
+ }
+
+ // Allocate the transposed-weights tensor once all the configure methods have been called, if weight transposition is enabled
+ if(_transpose_weights)
+ {
+ _transpose_output.allocator()->allocate();
}
}
void NEFullyConnectedLayer::run()
{
- ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
- (this->*_run_func)();
+ // Reshape the weights (happens only once, on the first run)
+ if(_is_first_run)
+ {
+ _is_first_run = false;
+ if(_transpose_weights)
+ {
+ NEScheduler::get().multithread(&_transpose_kernel);
+ }
+ if(_batched_fc_layer)
+ {
+ NEScheduler::get().multithread(&_transpose1xW_kernel);
+ }
+ }
+
+ // Linearize the input if it comes from a convolution layer
+ if(_fc_after_conv)
+ {
+ NEScheduler::get().multithread(&_im2col_kernel);
+ }
+
+ // Interleave input
+ if(_batched_fc_layer)
+ {
+ NEScheduler::get().multithread(&_interleave4x4_kernel);
+ }
+
+ // Run matrix multiply
+ NEScheduler::get().multithread(&_mm_kernel);
+
+ // Accumulate biases if provided
+ if(_accumulate_biases)
+ {
+ NEScheduler::get().multithread(&_accumulate_biases_kernel);
+ }
}
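
Note: a compact sketch of the four-case selection described in the "4 different cases" comment above, written against plain dimension vectors instead of TensorShape. This is an illustration of the logic under simplifying assumptions, not the library implementation.

#include <algorithm>
#include <cstddef>
#include <vector>

enum class FCCase
{
    CONV_TO_FC,          // convolution layer -> fully connected layer, no batches
    FC_TO_FC,            // fully connected layer -> fully connected layer, no batches
    CONV_TO_FC_BATCHED,  // convolution layer -> fully connected layer, with batches
    FC_TO_FC_BATCHED     // fully connected layer -> fully connected layer, with batches
};

static FCCase select_fc_case(const std::vector<std::size_t> &in,  // input dimensions
                             const std::vector<std::size_t> &w,   // weights dimensions (possibly transposed)
                             const std::vector<std::size_t> &out) // output dimensions
{
    const bool batched = out.size() > 1 && out[1] > 1;
    if(batched)
    {
        // Batched case: the input's dimensions from index 3 onwards must match
        // the output's dimensions from index 1 onwards (the batch dimensions).
        const bool after_conv = in.size() > 3 && out.size() >= in.size() - 2
                                && std::equal(in.begin() + 3, in.end(), out.begin() + 1);
        return after_conv ? FCCase::CONV_TO_FC_BATCHED : FCCase::FC_TO_FC_BATCHED;
    }
    // Non-batched case: the weights' second dimension must equal the flattened input volume.
    const std::size_t flat_input = in.at(0) * (in.size() > 1 ? in[1] : 1) * (in.size() > 2 ? in[2] : 1);
    const bool after_conv        = w.size() > 1 && w[1] == flat_input;
    return after_conv ? FCCase::CONV_TO_FC : FCCase::FC_TO_FC;
}

For example, a hypothetical 7x7x512x16 input with a 4096x16 output selects the batched convolution-to-FC path.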
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
new file mode 100644
index 0000000..4c77c88
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+
+using namespace arm_compute;
+
+void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
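
Note: a sketch of the 4x4 interleaving this new function wraps, based on my reading of the kernel's documented layout: four consecutive rows are merged into one output row, column by column, giving an output of (4 * width) x ceil(height / 4). Plain C++, assuming the height is a multiple of 4 to keep the sketch short.

#include <cstddef>
#include <vector>

// Illustrative only: reference 4x4 interleave of a row-major width x height matrix.
static std::vector<float> interleave4x4(const std::vector<float> &in, std::size_t width, std::size_t height)
{
    std::vector<float> out(in.size());
    std::size_t dst = 0;
    for(std::size_t r = 0; r < height; r += 4)   // each group of 4 rows becomes one output row
    {
        for(std::size_t c = 0; c < width; ++c)   // walk the columns
        {
            for(std::size_t k = 0; k < 4; ++k)   // emit a(r,c), a(r+1,c), a(r+2,c), a(r+3,c)
            {
                out[dst++] = in[(r + k) * width + c];
            }
        }
    }
    return out;
}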
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
new file mode 100644
index 0000000..8cba30d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(1) * 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f));
+ auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
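
Note: the output shape required by the asserts in the new configure() above, worked through for a hypothetical W x H input (the factor 4 is the one used by the asserts).

#include <cmath>
#include <cstdio>

int main()
{
    const unsigned int in_w = 10, in_h = 7;                                        // hypothetical input dims (dim0 x dim1)
    const unsigned int out_w = in_h * 4;                                           // output dim0 = input dim1 * 4
    const unsigned int out_h = static_cast<unsigned int>(std::ceil(in_w / 4.0f));  // output dim1 = ceil(input dim0 / 4)
    std::printf("transpose1xW output: %u x %u\n", out_w, out_h);                   // prints "28 x 3"
    return 0;
}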
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 1c75bee..cb8296b 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -76,23 +76,22 @@
TensorShape tensor_shape = pyramid->info()->tensor_shape();
tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
- PyramidInfo pyramid_info;
- pyramid_info.init(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
-
- _tmp.init_auto_padding(pyramid_info);
- _tmp.allocate();
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
+ _tmp.init(pyramid_info);
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
- /* Configure border */
- _border_handler[i].configure(_pyramid->get_pyramid_level(i), 2, border_mode, PixelValue(constant_border_value));
-
/* Configure horizontal kernel */
_horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
/* Configure vertical kernel */
_vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure border */
+ _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
}
+
+ _tmp.allocate();
}
}
@@ -140,11 +139,8 @@
_scale_nearest = arm_compute::cpp14::make_unique<NEScaleKernel[]>(num_levels - 1);
_offsets = arm_compute::cpp14::make_unique<Image[]>(num_levels - 1);
- PyramidInfo pyramid_info;
- pyramid_info.init(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
-
- _tmp.init_auto_padding(pyramid_info);
- _tmp.allocate();
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
+ _tmp.init(pyramid_info);
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
@@ -153,9 +149,7 @@
/* Allocate Image for the offsets used by NEAREST interpolation */
TensorInfo tensor_info(TensorShape(width, height), Format::S32);
- tensor_info.auto_padding();
_offsets[i].allocator()->init(tensor_info);
- _offsets[i].allocator()->allocate();
/* Configure gaussian 5x5 */
_gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
@@ -163,7 +157,11 @@
/* Configure scale image kernel */
_scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR,
border_mode == BorderMode::UNDEFINED);
+
+ _offsets[i].allocator()->allocate();
}
+
+ _tmp.allocate();
}
}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 8edb9cb..a5073b9 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -63,19 +63,13 @@
// Allocate memory for magnitude, phase and hog space
TensorInfo info_mag(shape_img, Format::S16);
- info_mag.auto_padding();
_mag.allocator()->init(info_mag);
- _mag.allocator()->allocate();
TensorInfo info_phase(shape_img, Format::U8);
- info_phase.auto_padding();
_phase.allocator()->init(info_phase);
- _phase.allocator()->allocate();
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
- info_space.auto_padding();
_hog_space.allocator()->init(info_space);
- _hog_space.allocator()->allocate();
// Initialise gradient kernel
_gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
@@ -85,6 +79,11 @@
// Initialize HOG norm kernel
_block_norm.configure(&_hog_space, output, hog->info());
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _hog_space.allocator()->allocate();
}
void NEHOGDescriptor::run()
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index c82f0af..c5b37f4 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -45,11 +45,8 @@
// Allocate image memory
TensorInfo info(shape_img, Format::S16);
- info.auto_padding();
_gx.allocator()->init(info);
- _gx.allocator()->allocate();
_gy.allocator()->init(info);
- _gy.allocator()->allocate();
// Initialise derivate kernel
_derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
@@ -67,6 +64,10 @@
k->configure(&_gx, &_gy, output_magnitude, output_phase);
_mag_phase = std::move(k);
}
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
}
void NEHOGGradient::run()
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 4ebe80d..effa64f 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -121,14 +121,10 @@
// Allocate tensors for magnitude and phase
TensorInfo info_mag(shape_img, Format::S16);
- info_mag.auto_padding();
_mag.allocator()->init(info_mag);
- _mag.allocator()->allocate();
TensorInfo info_phase(shape_img, Format::U8);
- info_phase.auto_padding();
_phase.allocator()->init(info_phase);
- _phase.allocator()->allocate();
// Initialise gradient kernel
_gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
@@ -142,7 +138,7 @@
const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size();
const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
- // Calculate number of cells along the x and y directions for the hog_space */
+ // Calculate number of cells along the x and y directions for the hog_space
const size_t num_cells_x = width / cell.width;
const size_t num_cells_y = height / cell.height;
@@ -153,9 +149,7 @@
// Allocate HOG space
TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
- info_space.auto_padding();
_hog_space[i].allocator()->init(info_space);
- _hog_space[i].allocator()->allocate();
// Initialise orientation binning kernel
_orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
@@ -169,9 +163,7 @@
// Allocate normalized HOG space
TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
- tensor_info.auto_padding();
_hog_norm_space[i].allocator()->init(tensor_info);
- _hog_norm_space[i].allocator()->allocate();
// Initialize block normalization kernel
_block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
@@ -187,6 +179,20 @@
// Configure non maxima suppression kernel
_non_maxima_kernel->configure(_detection_windows, min_distance);
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ _hog_norm_space[i].allocator()->allocate();
+ }
}
void NEHOGMultiDetection::run()
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 0f5215f..adefd47 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -55,31 +55,27 @@
ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
const TensorShape shape = input->info()->tensor_shape();
- TensorInfo tensor_info;
+ TensorInfo tensor_info_gxgy;
- /* Allocate memory */
if(gradient_size < 7)
{
- tensor_info.init_auto_padding(shape, Format::S16);
+ tensor_info_gxgy.init(shape, Format::S16);
}
else
{
- tensor_info.init_auto_padding(shape, Format::S32);
+ tensor_info_gxgy.init(shape, Format::S32);
}
- _gx.allocator()->init(tensor_info);
- _gx.allocator()->allocate();
- _gy.allocator()->init(tensor_info);
- _gy.allocator()->allocate();
+ _gx.allocator()->init(tensor_info_gxgy);
+ _gy.allocator()->init(tensor_info_gxgy);
- tensor_info.init_auto_padding(shape, Format::F32);
- _score.allocator()->init(tensor_info);
- _score.allocator()->allocate();
- _nonmax.allocator()->init(tensor_info);
- _nonmax.allocator()->allocate();
+ TensorInfo tensor_info_score(shape, Format::F32);
+ _score.allocator()->init(tensor_info_score);
+ _nonmax.allocator()->init(tensor_info_score);
+
_corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
- /* Set/init Sobel kernel accordingly with gradient_size */
+ // Set/init the Sobel kernel according to gradient_size
switch(gradient_size)
{
case 3:
@@ -107,11 +103,7 @@
ARM_COMPUTE_ERROR("Gradient size not implemented");
}
- /* Configure border filling before harris score*/
- _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
- _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
-
- /* Normalization factor */
+ // Normalization factor
const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
if(use_fp16)
@@ -144,7 +136,7 @@
}
else
{
- /* Set/init Harris Score kernel accordingly with block_size */
+ // Set/init the Harris score kernel according to block_size
switch(block_size)
{
case 3:
@@ -171,38 +163,50 @@
break;
}
}
- /* Init non-maxima suppression function */
+
+ // Configure border filling before harris score
+ _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
+ _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+
+ // Init non-maxima suppression function
_non_max_suppr.configure(&_score, &_nonmax, border_mode);
- /* Init corner candidates kernel */
+ // Init corner candidates kernel
_candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
- /* Init euclidean distance*/
+ // Init Euclidean distance
_sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+
+ // Allocate once all the configure methods have been called
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _score.allocator()->allocate();
+ _nonmax.allocator()->allocate();
}
void NEHarrisCorners::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
- /* Init to 0 number of corner candidates */
+ // Initialize the number of corner candidates to 0
_num_corner_candidates = 0;
- /* Run Sobel kernel */
+ // Run Sobel kernel
_sobel->run();
- /* Fill border before harris score kernel */
+ // Fill border before harris score kernel
_border_gx.run(_border_gx.window());
_border_gy.run(_border_gy.window());
- /* Run harris score kernel */
+ // Run harris score kernel
NEScheduler::get().multithread(_harris_score.get());
- /* Run non-maxima suppression */
+ // Run non-maxima suppression
_non_max_suppr.run();
- /* Run corner candidate kernel */
+ // Run corner candidate kernel
NEScheduler::get().multithread(&_candidates);
+ // Run sort & Euclidean distance
_sort_euclidean.run(_sort_euclidean.window());
}
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 2065f3c..8232c79 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -45,16 +45,19 @@
{
ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
- _gaussian_pyr_function.run(); // compute gaussian pyramid
+ // Compute Gaussian Pyramid
+ _gaussian_pyr_function.run();
for(unsigned int i = 0; i < _num_levels; ++i)
{
- _convf[i].run(); // convolute gaussian pyramid
+ // Apply the Gaussian filter to the Gaussian pyramid image
+ _convf[i].run();
}
for(unsigned int i = 0; i < _num_levels; ++i)
{
- _subf[i].run(); // compute laplacian image
+ // Compute the Laplacian image
+ _subf[i].run();
}
_depth_function.run();
@@ -77,10 +80,8 @@
PyramidInfo pyramid_info;
pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
- _gauss_pyr.init_auto_padding(pyramid_info);
- _gauss_pyr.allocate();
- _conv_pyr.init_auto_padding(pyramid_info);
- _conv_pyr.allocate();
+ _gauss_pyr.init(pyramid_info);
+ _conv_pyr.init(pyramid_info);
// Create Gaussian Pyramid function
_gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
@@ -95,4 +96,7 @@
}
_depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+
+ _gauss_pyr.allocate();
+ _conv_pyr.allocate();
}
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index dc59e14..36ac4a7 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -57,8 +57,8 @@
// Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) )
PyramidInfo pyramid_info;
pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
- _tmp_pyr.init_auto_padding(pyramid_info);
- _tmp_pyr.allocate();
+
+ _tmp_pyr.init(pyramid_info);
// Allocate add and scale functions. Level 0 does not need to be scaled.
_addf = arm_compute::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
@@ -77,6 +77,8 @@
// Convert level 0 from S16 to U8
_depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
+
+ _tmp_pyr.allocate();
}
void NELaplacianReconstruct::run()
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 5297991..bd89a0b 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -87,12 +87,9 @@
const unsigned int height_ith = new_ith_input->info()->dimension(1);
TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
- tensor_info.auto_padding();
_scharr_gx[i].allocator()->init(tensor_info);
- _scharr_gx[i].allocator()->allocate();
_scharr_gy[i].allocator()->init(tensor_info);
- _scharr_gy[i].allocator()->allocate();
/* Init Scharr kernel */
_func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
@@ -103,6 +100,9 @@
&_old_points_internal, &_new_points_internal,
termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
i, _num_levels, pyr_scale, border_offset);
+
+ _scharr_gx[i].allocator()->allocate();
+ _scharr_gy[i].allocator()->allocate();
}
}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bd8cf3a..b70f626 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -41,7 +41,7 @@
namespace
{
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_stride, size_t input_element_size)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size)
{
ARM_COMPUTE_ERROR_ON(nullptr == offsets);
@@ -63,7 +63,7 @@
const int in_xi = std::floor(in_x);
const int in_yi = std::floor(in_y);
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size + in_yi * input_stride;
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
*reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
*reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
},
@@ -77,9 +77,8 @@
execute_window_loop(win, [&](const Coordinates & id)
{
const size_t in_xi = (id.x() + 0.5f) * wr;
- const size_t in_yi = (id.y() + 0.5f) * hr;
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size + in_yi * input_stride;
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
},
offsets_it);
}
@@ -108,8 +107,7 @@
const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
- // Get the input stride and the input element size
- const size_t input_stride = input->info()->strides_in_bytes()[1];
+ // Get the element size of the input image
const size_t input_element_size = input->info()->element_size();
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
@@ -128,7 +126,6 @@
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
TensorInfo tensor_info_offsets(shape, Format::S32);
- tensor_info_offsets.auto_padding();
_offsets.allocator()->init(tensor_info_offsets);
k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
@@ -137,7 +134,7 @@
_offsets.allocator()->allocate();
// Pre-compute offsets for nearest interpolation
- precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_stride, input_element_size);
+ precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size);
break;
}
case InterpolationPolicy::BILINEAR:
@@ -145,9 +142,6 @@
TensorInfo tensor_info_offsets(shape, Format::S32);
TensorInfo tensor_info_dxdy(shape, Format::F32);
- tensor_info_offsets.auto_padding();
- tensor_info_dxdy.auto_padding();
-
_offsets.allocator()->init(tensor_info_offsets);
_dx.allocator()->init(tensor_info_dxdy);
_dy.allocator()->init(tensor_info_dxdy);
@@ -160,7 +154,7 @@
_dy.allocator()->allocate();
// Pre-compute dx, dy and offsets for bilinear interpolation
- precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_stride, input_element_size);
+ precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size);
break;
}
case InterpolationPolicy::AREA:
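
Note: after this change the precomputed offsets hold only the horizontal byte offset; the vertical component is no longer baked in. A minimal sketch of the nearest-neighbour precompute under that scheme, using a plain loop instead of execute_window_loop and values chosen purely for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const unsigned int out_w = 8, out_h = 4;
    const float wr = 2.0f;            // hypothetical input_width / output_width ratio
    const std::size_t element_size = 1; // e.g. a U8 input
    std::vector<int32_t> offsets(out_w * out_h);
    for(unsigned int y = 0; y < out_h; ++y)
    {
        for(unsigned int x = 0; x < out_w; ++x)
        {
            const std::size_t in_xi = static_cast<std::size_t>((x + 0.5f) * wr); // nearest source column
            offsets[y * out_w + x]  = static_cast<int32_t>(in_xi * element_size); // horizontal byte offset only
        }
    }
    std::printf("offset(3, 0) = %d\n", offsets[3]); // (3 + 0.5) * 2 -> column 7 -> 7 bytes
    return 0;
}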
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 3949529..55d4d3a 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,7 +32,7 @@
using namespace arm_compute;
NESoftmaxLayer::NESoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+ : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _fill_border_kernel_sum(), _max(), _sum(), _tmp()
{
}
@@ -42,31 +42,34 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
// Create intermediate tensors shapes
- TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type());
- tensor_info_tmp.auto_padding();
- _tmp.allocator()->init(tensor_info_tmp);
- _tmp.allocator()->allocate();
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
TensorShape shape = input->info()->tensor_shape();
shape.set(0, 1);
TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
- tensor_info_max_sum.auto_padding();
_max.allocator()->init(tensor_info_max_sum);
- _max.allocator()->allocate();
_sum.allocator()->init(tensor_info_max_sum);
- _sum.allocator()->allocate();
// Configure Kernels
- _fill_border_kernel.configure(input, 3, BorderMode::CONSTANT, PixelValue(-FLT_MAX));
_max_kernel.configure(input, &_max);
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
_norm_kernel.configure(&_tmp, &_sum, output);
+ _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
+ // Fill the border around the input with a sensible negative value before the shift/exp/sum kernel runs.
+ // This avoids exp(-FLT_MAX), which would lead to -inf and destroy the calculation of the sum when the input width is not a multiple of the number of elements processed per iteration.
+ _fill_border_kernel_sum.configure(input, _shift_exp_sum_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-50.f));
+
+ // Allocate intermediate tensors
+ _tmp.allocator()->allocate();
+ _max.allocator()->allocate();
+ _sum.allocator()->allocate();
}
void NESoftmaxLayer::run()
{
NEScheduler::get().multithread(&_fill_border_kernel);
NEScheduler::get().multithread(&_max_kernel);
+ NEScheduler::get().multithread(&_fill_border_kernel_sum);
NEScheduler::get().multithread(&_shift_exp_sum_kernel);
NEScheduler::get().multithread(&_norm_kernel);
}
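
Note: the -50.f fill value used for _fill_border_kernel_sum is small enough that, after the max shift and exponentiation, border lanes contribute essentially nothing to the sum, while avoiding feeding -FLT_MAX into the vectorized exponential. A back-of-the-envelope check, under the assumption that the padded lanes only need to be harmless rather than exact:

#include <cmath>
#include <cstdio>

int main()
{
    // exp(-50) is about 1.9e-22: adding it to a softmax denominator that is at
    // least exp(0) = 1 changes nothing at float precision, so padded lanes are harmless.
    std::printf("exp(-50) = %g\n", std::exp(-50.0));
    std::printf("1.0f + expf(-50) == 1.0f ? %s\n", (1.0f + std::exp(-50.0f) == 1.0f) ? "yes" : "no");
    return 0;
}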