blob: 3d82e3f0d9a84839ceddb571270dcef6b6ff741e [file] [log] [blame]
Kaizen8938bd32017-09-28 14:38:23 +01001/*
Jenkins514be652019-02-28 12:25:18 +00002 * Copyright (c) 2017-2019 ARM Limited.
Kaizen8938bd32017-09-28 14:38:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
25
26#include "arm_compute/core/CL/ICLTensor.h"
27#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
28#include "arm_compute/core/Error.h"
29#include "arm_compute/core/PixelValue.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/runtime/CL/CLScheduler.h"
33#include "arm_compute/runtime/Tensor.h"
34#include "support/ToolchainSupport.h"
35
36using namespace arm_compute;
37
Jenkinsb3a371b2018-05-23 11:36:53 +010038namespace
39{
Jenkinsb9abeae2018-11-22 11:58:08 +000040unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
Jenkinsb3a371b2018-05-23 11:36:53 +010041{
Jenkinsb9abeae2018-11-22 11:58:08 +000042 // We need only 1 stage for all axis except x-axis and x-axis for QASYMM8.
43 if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
44 {
45 return 1;
46 }
Jenkinsb3a371b2018-05-23 11:36:53 +010047 // Calculate number of WGs. 16 elements per thread, 8 threads per WG
48 const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
49
50 // Calculate number of stages. First stage performs op and the rest reduction sum
51 // depending on the size of the input. Last stage should have only 1 WG.
52 const unsigned int num_of_stages = num_of_wg / 128 + 2;
53
54 return num_of_stages;
55}
56} // namespace
57
Kaizen8938bd32017-09-28 14:38:23 +010058CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
Jenkins514be652019-02-28 12:25:18 +000059 : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
Kaizen8938bd32017-09-28 14:38:23 +010060{
61}
62
Jenkinsb3a371b2018-05-23 11:36:53 +010063Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
64{
Jenkinsb9abeae2018-11-22 11:58:08 +000065 const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
Jenkins514be652019-02-28 12:25:18 +000066 bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
67 if(is_serial)
68 {
69 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
70 }
71 else
Jenkinsb3a371b2018-05-23 11:36:53 +010072 {
Jenkinsb9abeae2018-11-22 11:58:08 +000073 // Create temporary tensor infos
74 auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
75
76 // Create intermediate tensor info
77 TensorShape shape{ input->tensor_shape() };
78
79 for(unsigned int i = 0; i < num_of_stages - 1; i++)
80 {
81 shape.set(0, ceil(shape.x() / 128.f));
82 sums_vector[i].set_data_type(input->data_type());
83 sums_vector[i].set_tensor_shape(shape);
84 sums_vector[i].set_num_channels(input->num_channels());
85 }
86
87 ReductionOperation first_kernel_op;
Jenkins514be652019-02-28 12:25:18 +000088 ReductionOperation intermediate_kernel_op;
Jenkinsb9abeae2018-11-22 11:58:08 +000089 ReductionOperation last_kernel_op;
90 switch(op)
91 {
92 case ReductionOperation::SUM:
93 case ReductionOperation::MEAN_SUM:
Jenkins514be652019-02-28 12:25:18 +000094 first_kernel_op = ReductionOperation::SUM;
95 intermediate_kernel_op = ReductionOperation::SUM;
96 last_kernel_op = op;
Jenkinsb9abeae2018-11-22 11:58:08 +000097 break;
98 case ReductionOperation::SUM_SQUARE:
Jenkins514be652019-02-28 12:25:18 +000099 first_kernel_op = ReductionOperation::SUM_SQUARE;
100 intermediate_kernel_op = ReductionOperation::SUM;
101 last_kernel_op = ReductionOperation::SUM;
102 break;
103 case ReductionOperation::PROD:
104 first_kernel_op = ReductionOperation::PROD;
105 intermediate_kernel_op = ReductionOperation::PROD;
106 last_kernel_op = ReductionOperation::PROD;
Jenkinsb9abeae2018-11-22 11:58:08 +0000107 break;
108 default:
109 ARM_COMPUTE_ERROR("Not supported");
110 }
111
112 // Validate ReductionOperation only on first kernel
113 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
114
115 // Validate ReductionOperation on intermediate stages
116 for(unsigned int i = 1; i < num_of_stages - 1; ++i)
117 {
Jenkins514be652019-02-28 12:25:18 +0000118 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
Jenkinsb9abeae2018-11-22 11:58:08 +0000119 }
120
121 // Validate ReductionOperation on the last stage
122 const unsigned int last_stage = num_of_stages - 1;
123 ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
Jenkinsb3a371b2018-05-23 11:36:53 +0100124 }
Jenkinsb3a371b2018-05-23 11:36:53 +0100125
Jenkinsb3a371b2018-05-23 11:36:53 +0100126 return Status{};
127}
128
Kaizen8938bd32017-09-28 14:38:23 +0100129void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
130{
Jenkinsb9abeae2018-11-22 11:58:08 +0000131 _num_of_stages = calculate_number_of_stages(input->info(), axis);
132 _reduction_axis = axis;
Jenkins514be652019-02-28 12:25:18 +0000133 _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
Kaizen8938bd32017-09-28 14:38:23 +0100134
135 // Configure reduction operation kernels
136 _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
Kaizen8938bd32017-09-28 14:38:23 +0100137
Jenkinsb9abeae2018-11-22 11:58:08 +0000138 // Create temporary tensors
Jenkins514be652019-02-28 12:25:18 +0000139 if(_is_serial)
140 {
141 _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
142 }
143 else
Kaizen8938bd32017-09-28 14:38:23 +0100144 {
Jenkinsb9abeae2018-11-22 11:58:08 +0000145 _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
Jenkins514be652019-02-28 12:25:18 +0000146 _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
Jenkinsb9abeae2018-11-22 11:58:08 +0000147 TensorShape shape{ input->info()->tensor_shape() };
148 for(unsigned int i = 0; i < _num_of_stages - 1; i++)
149 {
150 shape.set(0, ceil(shape.x() / 128.f));
Jenkins514be652019-02-28 12:25:18 +0000151 _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
Jenkinsb9abeae2018-11-22 11:58:08 +0000152 }
153
154 // Apply ReductionOperation only on first kernel
Jenkins514be652019-02-28 12:25:18 +0000155 _memory_group.manage(_results_vector.get());
Jenkinsb9abeae2018-11-22 11:58:08 +0000156
157 ReductionOperation first_kernel_op;
Jenkins514be652019-02-28 12:25:18 +0000158 ReductionOperation intermediate_kernel_op;
Jenkinsb9abeae2018-11-22 11:58:08 +0000159 ReductionOperation last_kernel_op;
Jenkins514be652019-02-28 12:25:18 +0000160 PixelValue pixelValue;
Jenkinsb9abeae2018-11-22 11:58:08 +0000161 switch(op)
162 {
163 case ReductionOperation::SUM:
164 case ReductionOperation::MEAN_SUM:
Jenkins514be652019-02-28 12:25:18 +0000165 first_kernel_op = ReductionOperation::SUM;
166 intermediate_kernel_op = ReductionOperation::SUM;
167 last_kernel_op = op;
168 pixelValue = PixelValue();
Jenkinsb9abeae2018-11-22 11:58:08 +0000169 break;
170 case ReductionOperation::SUM_SQUARE:
Jenkins514be652019-02-28 12:25:18 +0000171 first_kernel_op = ReductionOperation::SUM_SQUARE;
172 intermediate_kernel_op = ReductionOperation::SUM;
173 last_kernel_op = ReductionOperation::SUM;
174 pixelValue = PixelValue();
175 break;
176 case ReductionOperation::PROD:
177 first_kernel_op = ReductionOperation::PROD;
178 intermediate_kernel_op = ReductionOperation::PROD;
179 last_kernel_op = ReductionOperation::PROD;
180 pixelValue = PixelValue(1, input->info()->data_type());
Jenkinsb9abeae2018-11-22 11:58:08 +0000181 break;
182 default:
183 ARM_COMPUTE_ERROR("Not supported");
184 }
185
Jenkins514be652019-02-28 12:25:18 +0000186 _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
187 _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
Jenkinsb9abeae2018-11-22 11:58:08 +0000188
189 // Apply ReductionOperation on intermediate stages
190 for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
191 {
Jenkins514be652019-02-28 12:25:18 +0000192 _memory_group.manage(_results_vector.get() + i);
193 _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
194 _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
195 _results_vector[i - 1].allocator()->allocate();
Jenkinsb9abeae2018-11-22 11:58:08 +0000196 }
197
198 // Apply ReductionOperation on the last stage
199 const unsigned int last_stage = _num_of_stages - 1;
200 const unsigned int input_width = input->info()->dimension(0);
Jenkins514be652019-02-28 12:25:18 +0000201 _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
202 _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
203 _results_vector[last_stage - 1].allocator()->allocate();
Kaizen8938bd32017-09-28 14:38:23 +0100204 }
Kaizen8938bd32017-09-28 14:38:23 +0100205}
206
207void CLReductionOperation::run()
208{
209 _memory_group.acquire();
210
Jenkins514be652019-02-28 12:25:18 +0000211 if(_is_serial)
212 {
213 CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
214 }
215 else
Kaizen8938bd32017-09-28 14:38:23 +0100216 {
Jenkinsb9abeae2018-11-22 11:58:08 +0000217 for(unsigned int i = 0; i < _num_of_stages; ++i)
218 {
219 CLScheduler::get().enqueue(_border_handlers_vector[i], false);
220 CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
221 }
222 }
Kaizen8938bd32017-09-28 14:38:23 +0100223
224 _memory_group.release();
Jenkinsb3a371b2018-05-23 11:36:53 +0100225}