arm_compute v17.06
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index c99d59b..6f0da85 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -35,5 +35,5 @@
void INESimpleFunction::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(_kernel.get());
+ NEScheduler::get().schedule(_kernel.get(), Window::DimY); // Border handler above is invoked directly; the main kernel goes through the scheduler (DimY is presumably the split dimension - confirm in NEScheduler).
}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
new file mode 100644
index 0000000..a24429c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayer::NEBatchNormalizationLayer() // Default constructor: only value-initialises the kernel member.
+ : _norm_kernel()
+{
+}
+
+void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ // Thin wrapper: forward every argument unchanged, in the same order, to the kernel's configure(); nothing is validated at this level
+ _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void NEBatchNormalizationLayer::run()
+{
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY); // Hand the configured kernel to the scheduler; DimY is presumably the split dimension (confirm in NEScheduler).
+}
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index 2d7ad86..26f31f5 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -153,10 +153,10 @@
_border_mag_gradient.run(_border_mag_gradient.window());
// Run gradient
- NEScheduler::get().multithread(_gradient.get());
+ NEScheduler::get().schedule(_gradient.get(), Window::DimY);
// Run non-maxima suppression
- NEScheduler::get().multithread(&_non_max_suppr);
+ NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
memset(_output->buffer(), 0, _output->info()->total_size());
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index c2b3d7a..3f39ae2 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -98,12 +98,12 @@
if(_is_separable)
{
- NEScheduler::get().multithread(&_kernel_hor);
- NEScheduler::get().multithread(&_kernel_vert);
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
}
else
{
- NEScheduler::get().multithread(&_kernel);
+ NEScheduler::get().schedule(&_kernel, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index aae4a67..bd688cf 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -33,33 +33,93 @@
using namespace arm_compute;
-NEConvolutionLayer::NEConvolutionLayer()
- : _input_im2col_kernel(), _input_interleave_kernel(), _weights_reshape_kernel(), _weights_transposed_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
- _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _is_first_run(false), _has_bias(false)
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false) // _transpose1xW stays false until configure() overwrites it
{
}
-void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
- _has_bias = (biases != nullptr);
- _is_first_run = true;
+ // Note whether biases were supplied; if so, one extra row is reserved for them in the intermediate reshaped matrix below
+ const bool _has_bias = (biases != nullptr);
- // Get parameters for conv_info
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Intermediate tensor that receives the reshaped weights before the transpose kernel writes the final output
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate(); // allocated only after both kernels have been configured against it
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases, output); // no transpose requested: reshape straight into the caller's output
+ }
+}
+
+void NEConvolutionLayerReshapeWeights::run()
+{
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3); // NOTE(review): second argument is the literal 3 rather than a Window::Dim* constant - presumably the fourth dimension; confirm
+ if(_transpose1xW) // the transpose kernel is only configured when configure() received transpose1xW == true
+ {
+ NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
+ }
+}
+
+NEConvolutionLayer::NEConvolutionLayer()
+ : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+ _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false) // all three flags start false; configure() assigns them
+{
+}
+
+void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
+
+ // Get parameters from conv_info
unsigned int stride_x = 0;
unsigned int stride_y = 0;
unsigned int pad_x = 0;
@@ -70,21 +130,46 @@
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
stride_x, stride_y, pad_x, pad_y, conv_info.round());
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- // Create tensor to store the reshaped weights
- const unsigned int mat_weights_cols = weights->info()->dimension(3);
- const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
- _weights_reshaped.allocator()->init(info_wr);
+ // Check if it's a "fully connected" convolution (the convolved output is 1x1)
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- // Create tensor to store transposed weights
- TensorShape shape_wt(mat_weights_rows * 4, static_cast<unsigned int>(std::ceil(mat_weights_cols / 4.f)));
- TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
- _weights_transposed.allocator()->init(info_wt);
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+ // Reshape weights if needed
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ }
+ weights = &_weights_reshaped;
+ }
// Create tensor to store im2col reshaped inputs
const unsigned int mat_input_cols = mat_weights_rows;
@@ -93,58 +178,69 @@
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type());
- _input_im2col_reshaped.allocator()->init(info_im2col);
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
- // Create tensor to prepare input tensor for GEMM
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- TensorInfo info_interleaved(shape_interleaved, 1, input->info()->data_type());
- _input_interleaved_reshaped.allocator()->init(info_interleaved);
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(!_is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ }
// Create GEMM output tensor
TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
- TensorInfo info_gemm(shape_gemm, 1, input->info()->data_type());
- _gemm_output.allocator()->init(info_gemm);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
- _weights_transposed_kernel.configure(&_weights_reshaped, &_weights_transposed);
- _mm_kernel.configure(&_input_interleaved_reshaped, &_weights_transposed, &_gemm_output, 1.0f);
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ }
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
- // Allocate the tensors once the all configure methods have been called
- _weights_reshaped.allocator()->allocate();
- _weights_transposed.allocator()->allocate();
+ // Allocate intermediate tensor
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
_input_im2col_reshaped.allocator()->allocate();
- _input_interleaved_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
_gemm_output.allocator()->allocate();
}
void NEConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
+ if(!_are_weights_reshaped)
{
- _is_first_run = false;
- NEScheduler::get().multithread(&_weights_reshape_kernel, 3);
- NEScheduler::get().multithread(&_weights_transposed_kernel);
+ _are_weights_reshaped = true; // set the flag so later run() calls skip this branch
+ _reshape_weights.run();
}
// Run input reshaping
- NEScheduler::get().multithread(&_input_im2col_kernel);
+ NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+ if(!_is_fully_connected_convolution)
+ {
+ // Interleave the im2col output; skipped in the fully connected case, where the multiply kernel is configured on the im2col output directly
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
- // Run interleave
- NEScheduler::get().multithread(&_input_interleave_kernel);
-
- // Runs GEMM on reshaped matrices
- NEScheduler::get().multithread(&_mm_kernel);
+ // Run the matrix multiply on the reshaped input and weights
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
// Reshape output matrix
- NEScheduler::get().multithread(&_output_col2im_kernel);
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
new file mode 100644
index 0000000..7d2c549
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDepthConcatenate::NEDepthConcatenate()
+ : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0) // no kernels exist until configure() creates them
+{
+}
+
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // Sets up one concat kernel and one border-fill kernel per input tensor.
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); // at least two inputs are required
+
+ _num_inputs = inputs_vector.size(); // NOTE(review): the _inputs_vector member is not assigned anywhere in this function - confirm that is intended
+ _concat_kernels_vector = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+
+ unsigned int depth_offset = 0; // running sum of dimension(2) over the inputs handled so far; passed to each concat kernel
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); // constant-0 border sized from the matching concat kernel
+
+ depth_offset += inputs_vector.at(i)->info()->dimension(2);
+ }
+}
+
+void NEDepthConcatenate::run()
+{
+ for(unsigned i = 0; i < _num_inputs; ++i) // inputs are handled in the order they were passed to configure()
+ {
+ NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX); // border fill for input i is scheduled before its concat kernel
+ NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index 5f3594a..a339cae 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -33,8 +33,8 @@
void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::U16, DataType::S16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 340e1ce..2887c13 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -48,5 +48,5 @@
void NEDerivative::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_kernel);
+ NEScheduler::get().schedule(&_kernel, Window::DimY); // Border handler above is invoked directly; the derivative kernel goes through the scheduler (DimY presumably the split dimension - confirm).
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..3f3e771
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEDirectConvolutionLayer::NEDirectConvolutionLayer()
+ : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator() // members are only value-initialised here; configure() does the real setup
+{
+}
+
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info) // Configures the convolution, bias-accumulation and input-border kernels.
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+
+ // Release the accumulator if it currently holds a buffer (e.g. from an earlier configure() call)
+ if(_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ // QS8 output: convolve into a QS16 accumulator with the output's shape, then let the bias kernel write the final output
+ if(output->info()->data_type() == DataType::QS8)
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ }
+ else // any other output type: convolve straight into output; the bias kernel is configured with (output, bias) only
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ }
+
+ // Fill the input border with constant 0, sized from the convolution kernel's border_size()
+ _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void NEDirectConvolutionLayer::run()
+{
+ _input_border_handler.run(_input_border_handler.window()); // border fill is invoked directly over its own window, not via the scheduler
+
+ NEScheduler::get().schedule(&_conv_kernel, Window::DimZ); // convolution is scheduled with DimZ, the bias accumulation below with DimY
+ NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index a8b132d..f6ec677 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -52,11 +52,11 @@
void NEEqualizeHistogram::run()
{
// Calculate histogram of input.
- NEScheduler::get().multithread(&_histogram_kernel);
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY); // goes through the scheduler; the cumulative-distribution kernel below is invoked directly instead
// Calculate cumulative distribution of histogram and create LUT.
_cd_histogram_kernel.run(_cd_histogram_kernel.window());
// Map input to output using created LUT.
- NEScheduler::get().multithread(&_map_histogram_kernel);
+ NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY); // scheduled with Window::DimY, same as the histogram kernel
}
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 670b4d4..33a58f1 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -90,12 +90,12 @@
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_fast_corners_kernel);
+ NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
if(_non_max)
{
- NEScheduler::get().multithread(&_nonmax_kernel);
+ NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY);
}
- NEScheduler::get().multithread(&_fill_kernel);
+ NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index 7ff8f2f..e884f4a 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -35,5 +35,5 @@
void NEFillBorder::run()
{
- NEScheduler::get().multithread(&_border_handler, Window::DimZ);
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ); // the border handler is the only kernel run here; it is scheduled with Window::DimZ
}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index e6785b3..abb41e9 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -31,35 +31,99 @@
using namespace arm_compute;
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
+ : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false) // both mode flags start false; configure() assigns them
+{
+}
+
+void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer) // Sets up a plain transpose, a 1xW transpose, or both, according to the two flags.
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false)); // at least one of the two operations must be requested
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _transpose_weights = transpose_weights;
+ _is_batched_fc_layer = is_batched_fc_layer;
+
+ // Four cases follow from (transpose_weights, is_batched_fc_layer); the (false, false) combination is rejected
+ if(_transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Intermediate tensor for the plain transpose: dimensions 0 and 1 of the input swapped
+ TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_kernel.configure(input, &_transpose_output);
+
+ // Chain the 1xW transpose after it, writing to the caller's output
+ _transpose1xW_kernel.configure(&_transpose_output, output);
+
+ // Allocate the intermediate tensor only after both kernels have been configured against it
+ _transpose_output.allocator()->allocate();
+ }
+ else
+ {
+ _transpose_kernel.configure(input, output); // plain transpose only: write straight to output, no intermediate tensor
+ }
+ }
+ else
+ {
+ if(_is_batched_fc_layer)
+ {
+ // 1xW transpose only: applied directly to the input
+ _transpose1xW_kernel.configure(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+ }
+ }
+}
+
+void NEFullyConnectedLayerReshapeWeights::run()
+{
+ if(_transpose_weights) // the plain transpose, when requested, is scheduled first
+ {
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+ if(_is_batched_fc_layer) // then the 1xW transpose; when both flags are set it was configured on the transpose result
+ {
+ NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
+ }
+}
+
NEFullyConnectedLayer::NEFullyConnectedLayer()
- : _im2col_kernel(), _transpose_kernel(), _transpose1xW_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _transpose_output(),
- _transpose1xW_output(), _is_first_run(true), _transpose_weights(true), _fc_after_conv(false), _batched_fc_layer(false), _accumulate_biases(false)
+ : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+ _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false) // every flag starts out false
{
}
void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, input->info()->dimension(3));
shape_im2col.set(2, input->info()->dimension(4));
shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
- // Initialize output tensor for transpose 1xW
- TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
- _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -67,55 +131,49 @@
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_im2col_output.allocator()->allocate();
_interleave4x4_output.allocator()->allocate();
- _transpose1xW_output.allocator()->allocate();
}
void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
{
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
// Initialize output tensor for interleave 4x4
TensorShape shape_interleaved = input->info()->tensor_shape();
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
- _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
-
- // Initialize output tensor for transpose 1xW
- TensorShape shape_transposed1xW(weights->info()->dimension(1) * 4, static_cast<size_t>(std::ceil(weights->info()->dimension(0) / 4.f)));
- _transpose1xW_output.allocator()->init(TensorInfo(shape_transposed1xW, 1, weights->info()->data_type()));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
// Configure interleave4x4 kernel
_interleave4x4_kernel.configure(input, &_interleave4x4_output);
- // Configure transpose 1xW kernel
- _transpose1xW_kernel.configure(weights, &_transpose1xW_output);
-
// Configure matrix multiply kernel
- _mm_kernel.configure(&_interleave4x4_output, &_transpose1xW_output, output, 1.0f);
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
// Allocate the tensors once all the configure methods have been called
_interleave4x4_output.allocator()->allocate();
- _transpose1xW_output.allocator()->allocate();
}
void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
{
ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
TensorShape shape_im2col;
- shape_im2col.set(0, weights->info()->dimension(1));
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
shape_im2col.set(1, 1);
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
// Configure im2col kernel
_im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
@@ -135,20 +193,21 @@
_mm_kernel.configure(input, weights, output, 1.0f);
}
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights)
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
- _is_first_run = true;
- _transpose_weights = transpose_weights;
- _fc_after_conv = true;
- _batched_fc_layer = false;
- _accumulate_biases = false;
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
- const ITensor *weights_to_use = weights;
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
if(biases != nullptr)
{
@@ -160,17 +219,6 @@
_accumulate_biases_kernel.configure(output, biases);
}
- // Check if we need to transpose the weights
- if(_transpose_weights)
- {
- // Initialize the output tensor for transpose
- TensorShape shape_transposed(weights->info()->dimension(1), weights->info()->dimension(0));
- _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, weights->info()->data_type()));
- _transpose_kernel.configure(weights, &_transpose_output);
-
- weights_to_use = &_transpose_output;
- }
-
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
// 2) Fully Connected layer -> Fully Connected layer without batches
@@ -178,15 +226,54 @@
// 4) Fully Connected layer -> Fully Connected layer with batches
// Check if we have a fully connected layer with batches
- _batched_fc_layer = (output->info()->dimension(1) > 1);
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
- if(_batched_fc_layer)
+ const ITensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
{
- _fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ if((transpose_weights || _is_batched_fc_layer))
+ {
+ weights_to_use = &_reshape_weights_output;
- if(_fc_after_conv)
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer with batches
configure_conv_fc_wb(input, weights_to_use, output);
@@ -199,9 +286,10 @@
}
else
{
- _fc_after_conv = (weights_to_use->info()->dimension(1) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+ // In case of not batched fully connected layer, the weights will not be reshaped using transposed1xW
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc_nb(input, weights_to_use, output);
@@ -213,47 +301,44 @@
}
}
- // Allocate the transpose tensor if the transpose_weights flag is true and once all the configure methods have been called
- if(_transpose_weights)
+ // Allocate the reshaped weights tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
+ if(!are_weights_reshaped)
{
- _transpose_output.allocator()->allocate();
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
}
}
void NEFullyConnectedLayer::run()
{
// Reshape of the weights (happens only once)
- if(_is_first_run)
+ if(!_are_weights_reshaped)
{
- _is_first_run = false;
- if(_transpose_weights)
- {
- NEScheduler::get().multithread(&_transpose_kernel);
- }
- if(_batched_fc_layer)
- {
- NEScheduler::get().multithread(&_transpose1xW_kernel);
- }
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
}
// Linearize input if comes from a convolutional layer
- if(_fc_after_conv)
+ if(_is_fc_after_conv)
{
- NEScheduler::get().multithread(&_im2col_kernel);
+ NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
}
// Interleave input
- if(_batched_fc_layer)
+ if(_is_batched_fc_layer)
{
- NEScheduler::get().multithread(&_interleave4x4_kernel);
+ NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY);
}
// Run matrix multiply
- NEScheduler::get().multithread(&_mm_kernel);
+ NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX);
// Accumulate biases if provided
if(_accumulate_biases)
{
- NEScheduler::get().multithread(&_accumulate_biases_kernel);
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index f155dd5..15d5f4e 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -43,16 +43,16 @@
void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
if(c != nullptr)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
@@ -60,8 +60,8 @@
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // Check if the first input tensor is a vector and the data type is F32. If so, all the kernels for reshaping the tensors can be skipped
- if((a->info()->dimension(1) == 1) && (a->info()->data_type() == DataType::F32))
+ // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
+ if(a->info()->dimension(1) == 1)
{
_run_vector_matrix_multiplication = true;
@@ -94,14 +94,20 @@
break;
}
#endif
+ case DataType::QS8:
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
+ break;
+ }
default:
{
ARM_COMPUTE_ERROR_ON("Data type not supported");
}
}
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position()); // reshaped copy of b keeps b's own fixed point position
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
@@ -133,18 +139,18 @@
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
- NEScheduler::get().multithread(&_interleave_kernel);
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
// Run transpose kernel
- NEScheduler::get().multithread(&_transpose_kernel);
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
}
// Run matrix multiply kernel
- NEScheduler::get().multithread(&_mm_kernel, _run_vector_matrix_multiplication ? 0 : 1);
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
// Run matrix addition kernel
if(_run_addition)
{
- NEScheduler::get().multithread(&_ma_kernel);
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
index 3866f28..b64f769 100644
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -49,14 +49,14 @@
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
- /* The interleaved output matrix will have the following shape: [ a_height * 4, a_width / 4 ] */
+ /* The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ] */
TensorShape shape_tmp_a = a->info()->tensor_shape();
shape_tmp_a.set(0, a->info()->dimension(0) * 4);
shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 4);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.f));
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
@@ -74,11 +74,11 @@
void NEGEMMLowp::run()
{
/* Run interleave kernel */
- NEScheduler::get().multithread(&_interleave_kernel);
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
/* Run transpose kernel */
- NEScheduler::get().multithread(&_transpose_kernel);
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
/* Run matrix multiply kernel */
- NEScheduler::get().multithread(&_mm_kernel);
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index 8cba30d..dc40ece 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -34,11 +34,6 @@
void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(1) * 4);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f));
auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
k->configure(input, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 34447b1..5ccc765 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -55,6 +55,6 @@
void NEGaussian5x5::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_kernel_hor);
- NEScheduler::get().multithread(&_kernel_vert);
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index cb8296b..e1d64f1 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -108,8 +108,8 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
_border_handler[i].run(_border_handler[i].window());
- NEScheduler::get().multithread(_horizontal_reduction.get() + i);
- NEScheduler::get().multithread(_vertical_reduction.get() + i);
+ NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+ NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
}
}
@@ -178,6 +178,6 @@
for(unsigned int i = 0; i < num_levels - 1; ++i)
{
_gaus5x5[i].run();
- NEScheduler::get().multithread(_scale_nearest.get() + i);
+ NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
}
}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index a5073b9..a592f53 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -92,8 +92,8 @@
_gradient.run();
// Run orientation binning kernel
- NEScheduler::get().multithread(&_orient_bin);
+ NEScheduler::get().schedule(&_orient_bin, Window::DimY);
// Run block normalization kernel
- NEScheduler::get().multithread(&_block_norm);
+ NEScheduler::get().schedule(&_block_norm, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index f0d6121..e8ed29d 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -31,8 +31,6 @@
void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
{
auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
-
k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
-
_kernel = std::move(k);
}
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index c5b37f4..2f4b880 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -76,5 +76,5 @@
_derivative.run();
// Run magnitude/phase kernel
- NEScheduler::get().multithread(_mag_phase.get());
+ NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index effa64f..173b8f4 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -115,7 +115,7 @@
_orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
_block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
_hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
- _non_maxima_kernel = arm_compute::cpp14::make_unique<NEHOGNonMaximaSuppressionKernel>();
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
_hog_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
_hog_norm_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
@@ -208,13 +208,13 @@
// Run orientation binning kernel
for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
{
- NEScheduler::get().multithread(_orient_bin_kernel.get() + i);
+ NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
}
// Run block normalization kernel
for(size_t i = 0; i < _num_block_norm_kernel; ++i)
{
- NEScheduler::get().multithread(_block_norm_kernel.get() + i);
+ NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
}
// Run HOG detector kernel
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index adefd47..b54fb67 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -199,13 +199,13 @@
_border_gy.run(_border_gy.window());
// Run harris score kernel
- NEScheduler::get().multithread(_harris_score.get());
+ NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
// Run non-maxima suppression
_non_max_suppr.run();
// Run corner candidate kernel
- NEScheduler::get().multithread(&_candidates);
+ NEScheduler::get().schedule(&_candidates, Window::DimY);
// Run sort & euclidean distance
_sort_euclidean.run(_sort_euclidean.window());
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index 6747f2e..c42b2a5 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -54,5 +54,5 @@
void NEHistogram::run()
{
// Calculate histogram of input.
- NEScheduler::get().multithread(&_histogram_kernel);
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
new file mode 100644
index 0000000..85d7ba3
--- /dev/null
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+/** Default-constructs every kernel and intermediate tensor; the first-run flag stays false until configure() sets it. */
+NELocallyConnectedLayer::NELocallyConnectedLayer()
+    : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+/** Validate the arguments, size the intermediate tensors and configure the im2col, weights-reshape, matrix-multiply and col2im kernels.
+ *
+ * @param[in]  input     Source tensor. Must be single-channel F32.
+ * @param[in]  weights   Weights tensor. Must be single-channel F32; dimension 2 must equal dimension 2 of @p input and
+ *                       dimension 4 must equal conv_w * conv_h (the convolved output size).
+ * @param[in]  biases    Biases tensor, or nullptr for none. Must be single-channel F32 with at most 2 dimensions;
+ *                       dimension 0 must equal dimension 3 of @p weights.
+ * @param[out] output    Destination tensor. Must be single-channel F32; dimensions 0 and 1 must equal conv_w and conv_h.
+ * @param[in]  conv_info Stride, padding and rounding policy used to compute the convolved dimensions.
+ */
+void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+    }
+
+    // Local flag (not a data member): true when a biases tensor was supplied
+    const bool has_bias = (biases != nullptr);
+
+    // Arm the one-off weights reshape performed by the next run()
+    _is_first_run = true;
+
+    // Get parameters for conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    unsigned int pad_x = 0;
+    unsigned int pad_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    std::tie(pad_x, pad_y) = conv_info.pad();
+
+    // Get convolved dimensions (only weights dimension 0 is passed as the kernel size)
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+                                                 stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Create tensor to store the reshaped weights; one extra row is reserved when biases are provided
+    const size_t mat_weights_cols = weights->info()->dimension(3);
+    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (has_bias ? 1 : 0);
+    const size_t mat_weights_num = weights->info()->dimension(4);
+
+    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+    // Create tensor to store im2col reshaped inputs
+    const size_t mat_input_cols = mat_weights_rows;
+    const size_t mat_input_rows = conv_w * conv_h;
+    TensorShape shape_im2col = input->info()->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+    // Create locally connected layer output tensor
+    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+    // Configure kernels
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, has_bias);
+    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+    _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+    // Allocate intermediate tensors once every kernel has been configured
+    _weights_reshaped.allocator()->allocate();
+    _input_im2col_reshaped.allocator()->allocate();
+    _gemm_output.allocator()->allocate();
+}
+
+/** Run the layer: one-off weights reshape on the first call after configure(), then im2col, matrix multiply and col2im. */
+void NELocallyConnectedLayer::run()
+{
+    // Run weights reshaping (runs once after every configure)
+    if(_is_first_run)
+    {
+        _is_first_run = false;
+        // Dimension index 3 is passed as the scheduling dimension (no named Window constant is used for it here)
+        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+    }
+
+    // Run input reshaping
+    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+
+    // Runs GEMM on reshaped matrices
+    NEScheduler::get().schedule(&_mm_kernel, Window::DimX);
+
+    // Reshape output matrix
+    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 3fb5769..47143f5 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -43,5 +43,5 @@
_global_sum = 0;
_global_sum_squared = 0;
- NEScheduler::get().multithread(&_mean_stddev_kernel);
+ NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index ba73ef9..cab9200 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -43,8 +43,8 @@
_min_max.reset();
/* Run min max kernel */
- NEScheduler::get().multithread(&_min_max);
+ NEScheduler::get().schedule(&_min_max, Window::DimY);
/* Run min max location */
- NEScheduler::get().multithread(&_min_max_loc);
+ NEScheduler::get().schedule(&_min_max_loc, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index ff38e61..69ff325 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -41,7 +41,7 @@
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
- TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type());
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
_input_squared.allocator()->init(tensor_info);
// Configure kernels
@@ -55,7 +55,7 @@
void NENormalizationLayer::run()
{
- NEScheduler::get().multithread(&_multiply_kernel);
- NEScheduler::get().multithread(&_border_handler);
- NEScheduler::get().multithread(&_norm_kernel);
+ NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index 993153b..49135e4 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -113,7 +113,7 @@
// Run Scharr kernel
_func_scharr[level - 1].run();
- /* Run Lucas-Kanade kernel */
- NEScheduler::get().multithread(_kernel_tracker.get() + level - 1, Window::DimX);
+ // Run Lucas-Kanade kernel
+ NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
}
}
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 1859b30..8967a22 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -76,6 +76,6 @@
void NESobel5x5::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_sobel_hor);
- NEScheduler::get().multithread(&_sobel_vert);
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index 8af5e8d..f628da9 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -76,6 +76,6 @@
void NESobel7x7::run()
{
_border_handler.run(_border_handler.window());
- NEScheduler::get().multithread(&_sobel_hor);
- NEScheduler::get().multithread(&_sobel_vert);
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 55d4d3a..0651eab 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,21 +32,22 @@
using namespace arm_compute;
NESoftmaxLayer::NESoftmaxLayer()
- : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _fill_border_kernel_sum(), _max(), _sum(), _tmp()
+ : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
{
}
void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
// Create intermediate tensors shapes
- _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+ TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+ _tmp.allocator()->init(tensor_info_tmp);
TensorShape shape = input->info()->tensor_shape();
shape.set(0, 1);
- TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
_max.allocator()->init(tensor_info_max_sum);
_sum.allocator()->init(tensor_info_max_sum);
@@ -55,9 +56,6 @@
_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
_norm_kernel.configure(&_tmp, &_sum, output);
_fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
- // Fill the border around tmp buffer with sensible negative value.
- // This avoids exp(-FLT_MAX) which will lead to -inf and destroy the calculation of sum when input is not a multiple of processed elements
- _fill_border_kernel_sum.configure(input, _shift_exp_sum_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-50.f));
// Allocate intermediate tensors
_tmp.allocator()->allocate();
@@ -67,9 +65,8 @@
void NESoftmaxLayer::run()
{
- NEScheduler::get().multithread(&_fill_border_kernel);
- NEScheduler::get().multithread(&_max_kernel);
- NEScheduler::get().multithread(&_fill_border_kernel_sum);
- NEScheduler::get().multithread(&_shift_exp_sum_kernel);
- NEScheduler::get().multithread(&_norm_kernel);
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_max_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
}